From 96850ae470e0476ac69a13e50a154144dee55223 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 24 Nov 2017 17:07:33 +0100
Subject: [PATCH 001/150] initial commit to check fork

---
 tools/install_boost.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/tools/install_boost.sh b/tools/install_boost.sh
index a803778..4f30ddf 100755
--- a/tools/install_boost.sh
+++ b/tools/install_boost.sh
@@ -33,14 +33,14 @@
 # http://www.boost.org/more/getting_started/unix-variants.html#prepare-to-use-a-boost-library-binary
 # https://software.intel.com/en-us/articles/building-the-boost-library-to-run-natively-on-intelr-xeon-phitm-coprocessor
 
-DOWNLOAD_PATH=$HOME/Downloads
-INSTALL_PATH=$HOME/Software
+DOWNLOAD_PATH=$HOME/boost/
+INSTALL_PATH=$HOME/software
 NO_MIC=false # set to true, to disable building Boost for Xeon Phi
 BASHRC_FILE=$HOME/.bashrc # set to /dev/null to disable, or to any other file to manually merge the needed changes into your .bashrc 
 
 BOOST_BUILD_OPTIONS="-j8" # concurrent build with up to 8 commands
 BOOST_NAME=boost
-BOOST_VERSION=1_56_0
+BOOST_VERSION=1_65_1
 BOOST_MIC_SUFFIX=mic
 BOOST_ARCHIVE=${BOOST_NAME}_${BOOST_VERSION} # NOTE: without tar.bz2
 

From 359a6cb8f267b084c1fd6d24f397738a076da230 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 24 Nov 2017 17:07:33 +0100
Subject: [PATCH 002/150] initial commit to check fork

---
 tools/install_boost.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/install_boost.sh b/tools/install_boost.sh
index a803778..4f30ddf 100755
--- a/tools/install_boost.sh
+++ b/tools/install_boost.sh
@@ -33,14 +33,14 @@
 # http://www.boost.org/more/getting_started/unix-variants.html#prepare-to-use-a-boost-library-binary
 # https://software.intel.com/en-us/articles/building-the-boost-library-to-run-natively-on-intelr-xeon-phitm-coprocessor
 
-DOWNLOAD_PATH=$HOME/Downloads
-INSTALL_PATH=$HOME/Software
+DOWNLOAD_PATH=$HOME/boost/
+INSTALL_PATH=$HOME/software
 NO_MIC=false # set to true, to disable building Boost for Xeon Phi
 BASHRC_FILE=$HOME/.bashrc # set to /dev/null to disable, or to any other file to manually merge the needed changes into your .bashrc 
 
 BOOST_BUILD_OPTIONS="-j8" # concurrent build with up to 8 commands
 BOOST_NAME=boost
-BOOST_VERSION=1_56_0
+BOOST_VERSION=1_65_1
 BOOST_MIC_SUFFIX=mic
 BOOST_ARCHIVE=${BOOST_NAME}_${BOOST_VERSION} # NOTE: without tar.bz2
 

From a03e64186bec99a8fb97a8b53c6305b62ea0b79b Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 14 Dec 2017 17:11:48 +0100
Subject: [PATCH 003/150] tds changes

---
 .gitignore                      | 1 +
 Jamroot                         | 4 ++++
 include/ham/misc/migratable.hpp | 4 ++--
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index db5131f..f83f745 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 bin/*
 benchmark/results/*
+cmake-build-debug/*
diff --git a/Jamroot b/Jamroot
index f4bb375..39f7e78 100644
--- a/Jamroot
+++ b/Jamroot
@@ -116,6 +116,10 @@ exe active_msgs
 	: active_msgs.cpp
 	;
 
+exe active_msgs_over_file
+	: active_msgs_over_file.cpp boost_program_options
+	;
+
 exe ham_offload
 	: ham_offload.cpp ham_offload_scif boost_program_options
 #	: <library>/mpi//mpi <define>HAM_COMM_MPI
diff --git a/include/ham/misc/migratable.hpp b/include/ham/misc/migratable.hpp
index 012a99e..9ed002e 100644
--- a/include/ham/misc/migratable.hpp
+++ b/include/ham/misc/migratable.hpp
@@ -28,12 +28,12 @@ class migratable
 	 //: value(std::forward<T>(arg)) // NOTE: compatible types are allowed
 	 : value(std::forward<Compatible>(arg)) // NOTE: compatible types are allowed
 	{ 
-//		std::cout << "migratable-ctor: " << value << std::endl;
+		std::cout << "migratable-ctor: " << value << std::endl;
 	}
 
 	operator const T& () const
 	{
-//		std::cout << "migratable-conversion: " << value << std::endl;
+		std::cout << "migratable-conversion: " << value << std::endl;
 		return value;
 	}
 private:

From 9a1a2b41209c1ee836988b3936850a1ae23f5e67 Mon Sep 17 00:00:00 2001
From: Deppisch <bzcdeppi@pvs-pc06.zib.de>
Date: Thu, 14 Dec 2017 17:11:48 +0100
Subject: [PATCH 004/150] tds changes

---
 .gitignore                      | 1 +
 Jamroot                         | 4 ++++
 include/ham/misc/migratable.hpp | 4 ++--
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index db5131f..f83f745 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 bin/*
 benchmark/results/*
+cmake-build-debug/*
diff --git a/Jamroot b/Jamroot
index f4bb375..39f7e78 100644
--- a/Jamroot
+++ b/Jamroot
@@ -116,6 +116,10 @@ exe active_msgs
 	: active_msgs.cpp
 	;
 
+exe active_msgs_over_file
+	: active_msgs_over_file.cpp boost_program_options
+	;
+
 exe ham_offload
 	: ham_offload.cpp ham_offload_scif boost_program_options
 #	: <library>/mpi//mpi <define>HAM_COMM_MPI
diff --git a/include/ham/misc/migratable.hpp b/include/ham/misc/migratable.hpp
index 012a99e..9ed002e 100644
--- a/include/ham/misc/migratable.hpp
+++ b/include/ham/misc/migratable.hpp
@@ -28,12 +28,12 @@ class migratable
 	 //: value(std::forward<T>(arg)) // NOTE: compatible types are allowed
 	 : value(std::forward<Compatible>(arg)) // NOTE: compatible types are allowed
 	{ 
-//		std::cout << "migratable-ctor: " << value << std::endl;
+		std::cout << "migratable-ctor: " << value << std::endl;
 	}
 
 	operator const T& () const
 	{
-//		std::cout << "migratable-conversion: " << value << std::endl;
+		std::cout << "migratable-conversion: " << value << std::endl;
 		return value;
 	}
 private:

From 9d579d5ea6b1ece605e7309d657ed1b33c2d28df Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 14 Dec 2017 18:28:18 +0100
Subject: [PATCH 005/150] added active_msgs_over_file.cpp

---
 src/active_msgs_over_file.cpp | 195 ++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 src/active_msgs_over_file.cpp

diff --git a/src/active_msgs_over_file.cpp b/src/active_msgs_over_file.cpp
new file mode 100644
index 0000000..9b3cc4e
--- /dev/null
+++ b/src/active_msgs_over_file.cpp
@@ -0,0 +1,195 @@
+// modified by Daniel Deppisch (deppisch@zib.de) from:
+// active_msgs.cpp
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/program_options.hpp>
+#include <cstring>
+#include <iostream>
+#include <fstream>
+
+#include "ham/msg/active_msg_base.hpp"
+#include "ham/msg/execution_policy.hpp"
+#include "ham/msg/active_msg.hpp"
+#include "ham/misc/migratable.hpp"
+
+using namespace std;
+
+
+namespace ham {
+
+    // Specialisation for std::string: keeps the characters in a fixed-size
+    // array inside the object instead of in std::string's own storage.
+    template<>
+    class migratable<std::string> {
+    public:
+        migratable(const migratable &) = default;
+        migratable(migratable &&) = default;
+        migratable &operator=(const migratable &) = default;
+        migratable &operator=(migratable &&) = default;
+
+        // Stores a copy of arg's characters; arg must provide c_str().
+        // Input longer than the array is truncated rather than overflowing it.
+        template<typename Compatible>
+        migratable(Compatible &&arg) : value{} { // zero-fill: guarantees termination
+            std::cout << "migratable<string>-ctor: " << arg << std::endl;
+            std::strncpy(value, arg.c_str(), sizeof(value) - 1);
+        }
+
+        // Rebuilds a std::string from the stored characters.
+        operator std::string() const {
+            std::cout << "migratable<string>-conversion: " << value << std::endl;
+            return value;
+        }
+    private:
+        char value[256];
+    };
+} // namespace ham
+
+// A minimal active message without payload, used for testing.
+// It derives from ham::msg::active_msg, passing itself as the template argument.
+class MsgA : public ham::msg::active_msg<MsgA> {
+public:
+	void operator()() { // the message's action: only prints a confirmation
+                cout << "MsgA::operator() successfully called." << endl;
+		// the message could perform some task here
+		// and possibly send back a result afterwards, e.g. by
+		// - using data transferred as member inside the message
+		// - calling some communication layer
+		// - ...
+	}
+	
+	// the message could include members that are safe to transfer between the communicating entities
+};
+
+class MsgB : public ham::msg::active_msg<MsgB> { // test message carrying two text payloads
+public:
+    // t_in:  C string copied into the fixed-size member; longer input is truncated
+    // text2: second payload, stored through ham::migratable<string>
+    MsgB(const char* t_in, std::string text2)
+    : text{}, text2(text2) // zero-filled text keeps the bounded copy below terminated
+    {
+        std::strncpy(text, t_in, sizeof(text) - 1); // strcpy would overrun text for long input
+    }
+
+    // prints a confirmation followed by both payloads
+    void operator()() {
+        cout << "MsgB::operator() successfully called." << endl;
+        cout << "Text: " << text << endl;
+        cout << "Text2: " << static_cast<std::string>(text2) << endl;
+        // the message could perform some task here and possibly send back a
+        // result afterwards, e.g. by using data transferred as member inside
+        // the message or by calling some communication layer
+    }
+private:
+    char text[256]; // members must be safe to transfer between the communicating entities
+    ham::migratable<string> text2;
+};
+
+// a simple test which simulates a communication channel via filesystem
+// of course, this does NOT test the communication backend
+// this may be used to write and read a message from filesystem to simulate communication between different binaries without a supported backend
+
+// Writes the raw object representation of func (sizeof(func) bytes) to filename.
+// Returns true only if the file could be opened and all bytes were written out.
+template<typename Msg>
+bool write_active_msg(Msg& func, std::string const & filename)
+{
+        std::ofstream b_stream(filename.c_str(), std::fstream::out | std::fstream::binary);
+        if (!b_stream) {
+            return false; // file could not be opened or created
+        }
+
+        b_stream.write(reinterpret_cast<char*>(&func), sizeof(func));
+        // Flush before checking the state: the bytes may still sit in the stream
+        // buffer, in which case a failing write would otherwise go unnoticed.
+        b_stream.flush();
+        return b_stream.good();
+}
+
+// Reads a message from filename and executes it; returns false if it cannot be read.
+bool read_active_msg(std::string const & filename)
+{
+    // Open positioned at the end so tellg() yields the file size (-1 on failure).
+    std::ifstream b_stream(filename.c_str(), std::fstream::in | std::fstream::binary | std::fstream::ate);
+    const std::streamoff bufferSize = b_stream.tellg();
+    if (bufferSize < static_cast<std::streamoff>(sizeof(ham::msg::active_msg_base))) {
+        // unreadable file, or too short to hold the base part the buffer is cast to below
+        cout << "ERROR: reading file " << filename << " failed" << endl;
+        return false;
+    }
+
+    char* buffer = new char[bufferSize];
+    b_stream.seekg(0, ios::beg);
+    if (!b_stream.read(buffer, bufferSize)) {
+        cout << "ERROR: reading file " << filename << " failed" << endl;
+        delete [] buffer; // release the buffer on the error path as well
+        return false;
+    }
+
+    // simulate reading from the channel: cast the buffer back to the known base class of all active messages
+    auto functor = *reinterpret_cast<ham::msg::active_msg_base*>(buffer);
+
+    // Calling the base-class functor with the receive buffer triggers a handler
+    // look-up followed by the execution of that handler, which is defined by the
+    // execution policy of the actual message type (e.g. cast the buffer to the
+    // actual message type and execute it, or enqueue it for later processing).
+    functor(buffer);
+    delete [] buffer;
+    return true;
+}
+
+
+int main (int argc, char * argv[]) {
+	// Writes (-w) or reads and executes (-r) a message via a file; exits non-zero on failure.
+	// initialise active message handler address conversion data
+	ham::msg::msg_handler_registry::init();
+
+	// print message registry data
+	ham::msg::msg_handler_registry::print_handler_map(std::cout); // generated at static-init-time
+	ham::msg::msg_handler_registry::print_handler_vector(std::cout); // generated by the init-call above
+
+    // file standing in for the communication channel, and the text payload for MsgB
+    std::string filename;
+    std::string text;
+
+    // command line handling
+    boost::program_options::options_description desc("Options");
+        desc.add_options()
+          ("file,f", boost::program_options::value<std::string>(&filename), "specify file name (default: \"msgfile\")")
+          ("write,w", "make this process write a message to file")
+          ("read,r", "make this process read a message from file")
+          ("help,h", "print this help information")
+          ("text,t", boost::program_options::value<std::string>(&text), "add some text to display when executing message");
+
+    boost::program_options::variables_map vm;
+    boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(desc).allow_unregistered().run(), vm);
+    boost::program_options::notify(vm);
+
+    if (vm.count("help")) { // act on the declared help option instead of ignoring it
+        cout << desc << endl;
+        return 0;
+    }
+    if (!vm.count("file")) {
+        filename  = "msgfile";
+    }
+    // simple message type
+    MsgA fA;
+    // extended message type
+    MsgB fB(text.c_str(), "asdfasdasd");
+    bool ok = false; // result of the requested operation, reported through the exit code
+    if(vm.count("write")) {
+        if(vm.count("text")) {
+            ok = write_active_msg(fB, filename);
+        } else {
+            ok = write_active_msg(fA, filename);
+        }
+    } else if (vm.count("read")) {
+        ok = read_active_msg(filename);
+    } else {
+        cout << "ERROR: did not specify whether process should write or read." << endl;
+    }
+	return ok ? 0 : 1;
+}
+

From 0a3a2c23aabc91fba56ebcdfa94994f13b175feb Mon Sep 17 00:00:00 2001
From: Deppisch <bzcdeppi@pvs-pc06.zib.de>
Date: Thu, 14 Dec 2017 18:28:18 +0100
Subject: [PATCH 006/150] added active_msgs_over_file.cpp

---
 src/active_msgs_over_file.cpp | 195 ++++++++++++++++++++++++++++++++++
 1 file changed, 195 insertions(+)
 create mode 100644 src/active_msgs_over_file.cpp

diff --git a/src/active_msgs_over_file.cpp b/src/active_msgs_over_file.cpp
new file mode 100644
index 0000000..9b3cc4e
--- /dev/null
+++ b/src/active_msgs_over_file.cpp
@@ -0,0 +1,195 @@
+// modified by Daniel Deppisch (deppisch@zib.de) from:
+// active_msgs.cpp
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <boost/program_options.hpp>
+#include <cstring>
+#include <iostream>
+#include <fstream>
+
+#include "ham/msg/active_msg_base.hpp"
+#include "ham/msg/execution_policy.hpp"
+#include "ham/msg/active_msg.hpp"
+#include "ham/misc/migratable.hpp"
+
+using namespace std;
+
+
+namespace ham {
+
+    // Specialisation for std::string: keeps the characters in a fixed-size
+    // array inside the object instead of in std::string's own storage.
+    template<>
+    class migratable<std::string> {
+    public:
+        migratable(const migratable &) = default;
+        migratable(migratable &&) = default;
+        migratable &operator=(const migratable &) = default;
+        migratable &operator=(migratable &&) = default;
+
+        // Stores a copy of arg's characters; arg must provide c_str().
+        // Input longer than the array is truncated rather than overflowing it.
+        template<typename Compatible>
+        migratable(Compatible &&arg) : value{} { // zero-fill: guarantees termination
+            std::cout << "migratable<string>-ctor: " << arg << std::endl;
+            std::strncpy(value, arg.c_str(), sizeof(value) - 1);
+        }
+
+        // Rebuilds a std::string from the stored characters.
+        operator std::string() const {
+            std::cout << "migratable<string>-conversion: " << value << std::endl;
+            return value;
+        }
+    private:
+        char value[256];
+    };
+} // namespace ham
+
+// A minimal active message without payload, used for testing.
+// It derives from ham::msg::active_msg, passing itself as the template argument.
+class MsgA : public ham::msg::active_msg<MsgA> {
+public:
+	void operator()() { // the message's action: only prints a confirmation
+                cout << "MsgA::operator() successfully called." << endl;
+		// the message could perform some task here
+		// and possibly send back a result afterwards, e.g. by
+		// - using data transferred as member inside the message
+		// - calling some communication layer
+		// - ...
+	}
+	
+	// the message could include members that are safe to transfer between the communicating entities
+};
+
+class MsgB : public ham::msg::active_msg<MsgB> { // test message carrying two text payloads
+public:
+    // t_in:  C string copied into the fixed-size member; longer input is truncated
+    // text2: second payload, stored through ham::migratable<string>
+    MsgB(const char* t_in, std::string text2)
+    : text{}, text2(text2) // zero-filled text keeps the bounded copy below terminated
+    {
+        std::strncpy(text, t_in, sizeof(text) - 1); // strcpy would overrun text for long input
+    }
+
+    // prints a confirmation followed by both payloads
+    void operator()() {
+        cout << "MsgB::operator() successfully called." << endl;
+        cout << "Text: " << text << endl;
+        cout << "Text2: " << static_cast<std::string>(text2) << endl;
+        // the message could perform some task here and possibly send back a
+        // result afterwards, e.g. by using data transferred as member inside
+        // the message or by calling some communication layer
+    }
+private:
+    char text[256]; // members must be safe to transfer between the communicating entities
+    ham::migratable<string> text2;
+};
+
+// a simple test which simulates a communication channel via filesystem
+// of course, this does NOT test the communication backend
+// this may be used to write and read a message from filesystem to simulate communication between different binaries without a supported backend
+
+// Writes the raw object representation of func (sizeof(func) bytes) to filename.
+// Returns true only if the file could be opened and all bytes were written out.
+template<typename Msg>
+bool write_active_msg(Msg& func, std::string const & filename)
+{
+        std::ofstream b_stream(filename.c_str(), std::fstream::out | std::fstream::binary);
+        if (!b_stream) {
+            return false; // file could not be opened or created
+        }
+
+        b_stream.write(reinterpret_cast<char*>(&func), sizeof(func));
+        // Flush before checking the state: the bytes may still sit in the stream
+        // buffer, in which case a failing write would otherwise go unnoticed.
+        b_stream.flush();
+        return b_stream.good();
+}
+
+// Reads a message from filename and executes it; returns false if it cannot be read.
+bool read_active_msg(std::string const & filename)
+{
+    // Open positioned at the end so tellg() yields the file size (-1 on failure).
+    std::ifstream b_stream(filename.c_str(), std::fstream::in | std::fstream::binary | std::fstream::ate);
+    const std::streamoff bufferSize = b_stream.tellg();
+    if (bufferSize < static_cast<std::streamoff>(sizeof(ham::msg::active_msg_base))) {
+        // unreadable file, or too short to hold the base part the buffer is cast to below
+        cout << "ERROR: reading file " << filename << " failed" << endl;
+        return false;
+    }
+
+    char* buffer = new char[bufferSize];
+    b_stream.seekg(0, ios::beg);
+    if (!b_stream.read(buffer, bufferSize)) {
+        cout << "ERROR: reading file " << filename << " failed" << endl;
+        delete [] buffer; // release the buffer on the error path as well
+        return false;
+    }
+
+    // simulate reading from the channel: cast the buffer back to the known base class of all active messages
+    auto functor = *reinterpret_cast<ham::msg::active_msg_base*>(buffer);
+
+    // Calling the base-class functor with the receive buffer triggers a handler
+    // look-up followed by the execution of that handler, which is defined by the
+    // execution policy of the actual message type (e.g. cast the buffer to the
+    // actual message type and execute it, or enqueue it for later processing).
+    functor(buffer);
+    delete [] buffer;
+    return true;
+}
+
+
+int main (int argc, char * argv[]) {
+	// Writes (-w) or reads and executes (-r) a message via a file; exits non-zero on failure.
+	// initialise active message handler address conversion data
+	ham::msg::msg_handler_registry::init();
+
+	// print message registry data
+	ham::msg::msg_handler_registry::print_handler_map(std::cout); // generated at static-init-time
+	ham::msg::msg_handler_registry::print_handler_vector(std::cout); // generated by the init-call above
+
+    // file standing in for the communication channel, and the text payload for MsgB
+    std::string filename;
+    std::string text;
+
+    // command line handling
+    boost::program_options::options_description desc("Options");
+        desc.add_options()
+          ("file,f", boost::program_options::value<std::string>(&filename), "specify file name (default: \"msgfile\")")
+          ("write,w", "make this process write a message to file")
+          ("read,r", "make this process read a message from file")
+          ("help,h", "print this help information")
+          ("text,t", boost::program_options::value<std::string>(&text), "add some text to display when executing message");
+
+    boost::program_options::variables_map vm;
+    boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(desc).allow_unregistered().run(), vm);
+    boost::program_options::notify(vm);
+
+    if (vm.count("help")) { // act on the declared help option instead of ignoring it
+        cout << desc << endl;
+        return 0;
+    }
+    if (!vm.count("file")) {
+        filename  = "msgfile";
+    }
+    // simple message type
+    MsgA fA;
+    // extended message type
+    MsgB fB(text.c_str(), "asdfasdasd");
+    bool ok = false; // result of the requested operation, reported through the exit code
+    if(vm.count("write")) {
+        if(vm.count("text")) {
+            ok = write_active_msg(fB, filename);
+        } else {
+            ok = write_active_msg(fA, filename);
+        }
+    } else if (vm.count("read")) {
+        ok = read_active_msg(filename);
+    } else {
+        cout << "ERROR: did not specify whether process should write or read." << endl;
+    }
+	return ok ? 0 : 1;
+}
+

From 1b35b8fde8df8f10e5b35883d3271f1472345e83 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Tue, 27 Mar 2018 14:25:05 +0200
Subject: [PATCH 007/150] jamroot switch scif to mpi, comment out ctor output
 from migratable

---
 Jamroot                         | 6 +++---
 include/ham/misc/migratable.hpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Jamroot b/Jamroot
index 39f7e78..1d27769 100644
--- a/Jamroot
+++ b/Jamroot
@@ -121,9 +121,9 @@ exe active_msgs_over_file
 	;
 
 exe ham_offload
-	: ham_offload.cpp ham_offload_scif boost_program_options
-#	: <library>/mpi//mpi <define>HAM_COMM_MPI
-	: <library>scif <define>HAM_COMM_SCIF
+	: ham_offload.cpp ham_offload_mpi boost_program_options
+	: <library>/mpi//mpi <define>HAM_COMM_MPI
+#	: <library>scif <define>HAM_COMM_SCIF
 	;
 
 exe ham_offload_explicit
diff --git a/include/ham/misc/migratable.hpp b/include/ham/misc/migratable.hpp
index 9ed002e..0a31b42 100644
--- a/include/ham/misc/migratable.hpp
+++ b/include/ham/misc/migratable.hpp
@@ -28,12 +28,12 @@ class migratable
 	 //: value(std::forward<T>(arg)) // NOTE: compatible types are allowed
 	 : value(std::forward<Compatible>(arg)) // NOTE: compatible types are allowed
 	{ 
-		std::cout << "migratable-ctor: " << value << std::endl;
+		// std::cout << "migratable-ctor: " << value << std::endl;
 	}
 
 	operator const T& () const
 	{
-		std::cout << "migratable-conversion: " << value << std::endl;
+		// std::cout << "migratable-conversion: " << value << std::endl;
 		return value;
 	}
 private:

From fdce48fd44d2339bb947c381069fa79c0b00907a Mon Sep 17 00:00:00 2001
From: bemdeppi <bemdeppi@blogin1.hsn.hlrn.de>
Date: Tue, 27 Mar 2018 14:25:05 +0200
Subject: [PATCH 008/150] jamroot switch scif to mpi, comment out ctor output
 from migratable

---
 Jamroot                         | 6 +++---
 include/ham/misc/migratable.hpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Jamroot b/Jamroot
index 39f7e78..1d27769 100644
--- a/Jamroot
+++ b/Jamroot
@@ -121,9 +121,9 @@ exe active_msgs_over_file
 	;
 
 exe ham_offload
-	: ham_offload.cpp ham_offload_scif boost_program_options
-#	: <library>/mpi//mpi <define>HAM_COMM_MPI
-	: <library>scif <define>HAM_COMM_SCIF
+	: ham_offload.cpp ham_offload_mpi boost_program_options
+	: <library>/mpi//mpi <define>HAM_COMM_MPI
+#	: <library>scif <define>HAM_COMM_SCIF
 	;
 
 exe ham_offload_explicit
diff --git a/include/ham/misc/migratable.hpp b/include/ham/misc/migratable.hpp
index 9ed002e..0a31b42 100644
--- a/include/ham/misc/migratable.hpp
+++ b/include/ham/misc/migratable.hpp
@@ -28,12 +28,12 @@ class migratable
 	 //: value(std::forward<T>(arg)) // NOTE: compatible types are allowed
 	 : value(std::forward<Compatible>(arg)) // NOTE: compatible types are allowed
 	{ 
-		std::cout << "migratable-ctor: " << value << std::endl;
+		// std::cout << "migratable-ctor: " << value << std::endl;
 	}
 
 	operator const T& () const
 	{
-		std::cout << "migratable-conversion: " << value << std::endl;
+		// std::cout << "migratable-conversion: " << value << std::endl;
 		return value;
 	}
 private:

From 34848b591d15db0a92a1ada857dccb64a02f76d0 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 29 Mar 2018 13:40:58 +0200
Subject: [PATCH 009/150] initial commit of mpi_rma_dynamic prototype

---
 include/ham/offload/offload.hpp     | 18 +++++++++++++++
 include/ham/offload/offload_msg.hpp | 34 ++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index a84d338..a3cff70 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -235,6 +235,11 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	
 	return result;
 #endif
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+    future<void> result(comm.allocate_request(remote_dest.node()));
+	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
+	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+#endif
 }
 
 template<typename T>
@@ -268,6 +273,11 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 	return result;
 #endif
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+	future<void> result(comm.allocate_request(remote_source.node())); // get() has no remote_dest parameter; the remote side is remote_source
+	HAM_DEBUG( HAM_LOG << "offload::get(): initiating RMA get..." << std::endl; )
+	comm.recv_data_async(result.get_request(), remote_source, local_dest, n); // NOTE(review): result is not returned here, unlike the branch above — confirm before enabling
+#endif
 }
 
 template<typename T>
@@ -328,6 +338,14 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	read_result.get();
 	write_result.get();
 #endif
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+	future<void> result(comm.allocate_request(source.node()));
+	HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
+	// TODO: SEND READ_MSG to source (maybe introduce new copy_msg)
+	// TODO: MAKE SURE there is no winlock on dest from host
+
+	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+#endif
 }
 #endif
 
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index c42ffb8..845dd08 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -6,6 +6,7 @@
 #ifndef ham_offload_offload_msg_hpp
 #define ham_offload_offload_msg_hpp
 
+#include <mpi.h>
 #include "ham/msg/active_msg.hpp"
 #include "ham/msg/execution_policy.hpp"
 #include "ham/misc/constants.hpp"
@@ -80,6 +81,7 @@ class offload_msg
 	}
 };
 
+// should not be used by MPI_RMA_COMMUNICATOR since one-sided put is used
 template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_write_msg
 	: public active_msg<offload_write_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -106,6 +108,7 @@ class offload_write_msg
 	
 };
 
+// should not be used by MPI_RMA_COMMUNICATOR since one-sided put is used
 template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_read_msg
 	: public active_msg<offload_read_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -116,7 +119,7 @@ class offload_read_msg
 
 	void operator()() //const
 	{
-		communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node), n);  // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
+		communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node), n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
 		
 		// send a result message to tell the sender, that the transfer is done
 		if (req.valid()) {
@@ -131,6 +134,35 @@ class offload_read_msg
 	size_t n;
 };
 
+// Active message that, when executed, calls send_data() with local_source, a buffer_ptr built from (remote_node, remote_addr), and n.
+// TODO(daniel, high priority): implement offload_copy_msg, copy with one-sided rma needs a msg containing ptrs for source+target
+//#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+    template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
+    class offload_rma_copy_msg
+            : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
+    {
+    public:
+        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr,T* local_source, size_t n)
+                : req(req), remote_node(remote_node), remote_addr(remote_addr), local_source(local_source), n(n) { }
+        // Performs the transfer, then reports completion through req if req is valid.
+        void operator()() //const
+        {
+            communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address. NOTE(review): same wording as in offload_read_msg, yet remote_addr is passed here — confirm it still applies.
+
+            // send a result message to tell the sender, that the transfer is done
+            if (req.valid()) {
+                req.send_result((void*)&n, sizeof n);
+            }
+        }
+    private:
+        communicator::request req; // TODO(improvement, high priority): use a subset of req here!
+        // target of the transfer: node and address handed to buffer_ptr in operator()
+        node_t remote_node;
+        MPI_Aint remote_addr;
+        T* local_source; // local data handed to send_data
+        size_t n; // count handed to send_data; also sent back as the result payload
+    };
+//#endif
 } // namespace detail
 } // namespace offload
 } // namespace ham

From e5c46ed4953df083d8d3e7c9e7edb1aa9259e77f Mon Sep 17 00:00:00 2001
From: Deppisch <bzcdeppi@pvs-pc06.zib.de>
Date: Thu, 29 Mar 2018 13:40:58 +0200
Subject: [PATCH 010/150] initial commit of mpi_rma_dynamic prototype

---
 include/ham/offload/offload.hpp     | 18 +++++++++++++++
 include/ham/offload/offload_msg.hpp | 34 ++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index a84d338..a3cff70 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -235,6 +235,11 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	
 	return result;
 #endif
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+    future<void> result(comm.allocate_request(remote_dest.node()));
+	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
+	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+#endif
 }
 
 template<typename T>
@@ -268,6 +273,11 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 	return result;
 #endif
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+	future<void> result(comm.allocate_request(remote_source.node())); // get() has no remote_dest parameter; the remote side is remote_source
+	HAM_DEBUG( HAM_LOG << "offload::get(): initiating RMA get..." << std::endl; )
+	comm.recv_data_async(result.get_request(), remote_source, local_dest, n); // NOTE(review): result is not returned here, unlike the branch above — confirm before enabling
+#endif
 }
 
 template<typename T>
@@ -328,6 +338,14 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	read_result.get();
 	write_result.get();
 #endif
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+	future<void> result(comm.allocate_request(source.node()));
+	HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
+	// TODO: SEND READ_MSG to source (maybe introduce new copy_msg)
+	// TODO: MAKE SURE there is no winlock on dest from host
+
+	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+#endif
 }
 #endif
 
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index c42ffb8..845dd08 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -6,6 +6,7 @@
 #ifndef ham_offload_offload_msg_hpp
 #define ham_offload_offload_msg_hpp
 
+#include <mpi.h>
 #include "ham/msg/active_msg.hpp"
 #include "ham/msg/execution_policy.hpp"
 #include "ham/misc/constants.hpp"
@@ -80,6 +81,7 @@ class offload_msg
 	}
 };
 
+// should not be used by MPI_RMA_COMMUNICATOR since one-sided put is used
 template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_write_msg
 	: public active_msg<offload_write_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -106,6 +108,7 @@ class offload_write_msg
 	
 };
 
+// should not be used by MPI_RMA_COMMUNICATOR since one-sided put is used
 template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_read_msg
 	: public active_msg<offload_read_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -116,7 +119,7 @@ class offload_read_msg
 
 	void operator()() //const
 	{
-		communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node), n);  // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
+		communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node), n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
 		
 		// send a result message to tell the sender, that the transfer is done
 		if (req.valid()) {
@@ -131,6 +134,35 @@ class offload_read_msg
 	size_t n;
 };
 
+
+// TODO(daniel, high priority): implement offload_copy_msg, copy with one-sided rma needs a msg containing ptrs for source+target
+//#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+    template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
+    class offload_rma_copy_msg // active message that, when executed, pushes local_source to remote_addr on remote_node
+            : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
+    {
+    public:
+        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr,T* local_source, size_t n)
+                : req(req), remote_node(remote_node), remote_addr(remote_addr), local_source(local_source), n(n) { }
+        // Performs the transfer through communicator::send_data(), then acknowledges through req if req is valid.
+        void operator()() //const
+        {
+            communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n); // NOTE: Why nullptr? The remote buffer's local pointer is not known here; the destination is described by remote_addr instead (presumably consumed as the RMA target address by send_data -- TODO confirm).
+
+            // send a result message to tell the sender, that the transfer is done
+            if (req.valid()) {
+                req.send_result((void*)&n, sizeof n);
+            }
+        }
+    private:
+        communicator::request req; // TODO(improvement, high priority): use a subset of req here!
+
+        node_t remote_node; // node that receives the data
+        MPI_Aint remote_addr; // destination address on remote_node, passed as the third buffer_ptr argument
+        T* local_source; // source data on the node that executes this message
+        size_t n; // amount of data handed to send_data(); also sent back as the result payload
+    };
+//#endif
 } // namespace detail
 } // namespace offload
 } // namespace ham

From da9c4e964437c864dabc364660a4b36e79888b69 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 29 Mar 2018 13:43:49 +0200
Subject: [PATCH 011/150] initial commit of mpi_rma_dynamic prototype

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 368 ++++++++++++++++++
 1 file changed, 368 insertions(+)
 create mode 100644 include/ham/net/communicator_mpi_rma_dynamic.hpp

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
new file mode 100644
index 0000000..d1b1add
--- /dev/null
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -0,0 +1,368 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef ham_net_communicator_mpi_hpp
+#define ham_net_communicator_mpi_hpp
+
+#include <mpi.h>
+
+#include <cassert>
+#include <cstring> // memcpy
+#include <stdlib.h> // posix_memalign
+
+#include "ham/misc/constants.hpp"
+#include "ham/misc/resource_pool.hpp"
+#include "ham/misc/types.hpp"
+#include "ham/util/debug.hpp"
+#include "ham/util/log.hpp"
+
+namespace ham {
+namespace net {
+
+// Handle to a buffer of T on node node_: the node-local pointer plus the MPI address used to target it in RMA operations.
+template<typename T>
+class buffer_ptr {
+public:
+	buffer_ptr(); // null buffer on the calling node
+    buffer_ptr(T* ptr, node_t node) : ptr_(ptr), node_(node), mpi_address_(0) { } // no MPI address known (stored as 0)
+	buffer_ptr(T* ptr, node_t node, MPI_Aint mpi_address) : ptr_(ptr), node_(node), mpi_address_(mpi_address) { }
+
+	T* get() { return ptr_; }
+	node_t node() { return node_; }
+    MPI_Aint get_mpi_address() { return mpi_address_; }
+
+    // element access
+	T& operator [] (size_t i);
+	// basic pointer arithmetic to address sub-buffers; the MPI address is advanced by the same number
+	// of bytes so the result stays usable as an RMA target (an unknown address, i.e. 0, stays 0)
+	buffer_ptr<T> operator+(size_t off)
+	{
+		const MPI_Aint shifted = (mpi_address_ == 0) ? 0 : mpi_address_ + static_cast<MPI_Aint>(off * sizeof(T));
+		return buffer_ptr(ptr_ + off, node_, shifted);
+	}
+private:
+	T* ptr_;
+	node_t node_;
+    MPI_Aint mpi_address_;
+};
+
+class node_descriptor // fixed-size description of a node, currently just its processor name
+{
+public:
+	//node_descriptor() : name(MPI_MAX_PROCESSOR_NAME, 0) {}
+	// returns the processor name as a C string (filled and null-terminated by the communicator constructor)
+	//const std::string& name() const { return name_; }
+	const char* name() const { return name_; }
+private:
+	//std::string name_; // TODO(improvement): unify node description for all back-ends, NOTE: std::string is not trivially transferable
+	char name_[MPI_MAX_PROCESSOR_NAME + 1];
+	// the communicator writes name_ directly and exchanges whole descriptors as raw bytes
+	friend class net::communicator;
+};
+
+class communicator {
+public:
+	// externally used interface of request must be shared across all communicator-implementations
+	class request {
+	public:
+		request() : valid_(false), uses_rma(false), req_count(0) {} // instantiate invalid, with no pending MPI requests
+		request(node_t target_node, node_t source_node, size_t send_buffer_index, size_t recv_buffer_index)
+		 : target_node(target_node), source_node(source_node), valid_(true), uses_rma(false), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0)
+		{}
+
+		// return true if request was finished
+        // will not work as intended for rma ops, no equivalent to test() available for remote completion
+		bool test()
+		{
+			int flag = 0;
+			MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUSES_IGNORE); // tests all MPI requests collected so far; the array variant needs MPI_STATUSES_IGNORE
+
+            if(uses_rma)
+            {
+                HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
+            }
+
+            return flag != 0;
+		}
+		// waits for all MPI requests of this request, closes an open RMA lock epoch, and returns the result buffer
+		void* get() // blocks
+		{
+			HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+			MPI_Waitall(req_count, mpi_reqs, MPI_STATUSES_IGNORE); // must wait for all requests to satisfy the standard
+			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
+            if(uses_rma)
+            {
+                MPI_Win_unlock(target_node, communicator::instance().rma_win); // rma_win is a member of the communicator, not of request
+                uses_rma = false; // the epoch is closed; a repeated get() must not unlock again
+            }
+			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
+		}
+		// sends size bytes of result_msg back to source_node with RESULT_TAG; must be called on the target node
+		template<class T>
+		void send_result(T* result_msg, size_t size)
+		{
+			assert(communicator::this_node() == target_node); // this assert fails if send_result is called from the wrong side
+			
+			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
+			MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
+			//communicator::instance().send_msg(source_node, source_buffer_index, NO_BUFFER_INDEX, result_msg, size);
+		}
+
+		bool valid() const
+		{
+			return valid_;
+		}
+		// read accessor for the uses_rma flag; named is_rma() because a member function cannot share the data member's name
+        bool is_rma() const
+        {
+            return uses_rma;
+        }
+		// hands out the next free MPI_Request slot; at most NUM_REQUESTS slots exist per request
+		MPI_Request& next_mpi_request()
+		{
+			HAM_DEBUG( HAM_LOG << "next_mpi_request(): this=" << this << ", req_count=" << req_count << ", NUM_REQUESTS=" << NUM_REQUESTS << std::endl; )
+			assert(req_count < NUM_REQUESTS);
+			return mpi_reqs[req_count++]; // NOTE: post-increment
+		}
+
+		node_t target_node;
+		node_t source_node;
+		bool valid_;
+        bool uses_rma; // set by the communicator's *_async RMA transfers, cleared by get()
+
+		// only needed by the sender
+		enum { NUM_REQUESTS = 3 };
+		
+		size_t send_buffer_index; // buffer to use for sending the message
+		size_t recv_buffer_index; // buffer to use for receiving the result
+		size_t req_count;
+		
+	private:
+		MPI_Request mpi_reqs[NUM_REQUESTS]; // for sending the msg, receiving the result, and an associated data transfer
+	}; // class request
+
+	typedef request& request_reference_type;
+	typedef const request& request_const_reference_type;
+
+	communicator(int argc, char* argv[]) // sets up MPI, the dynamic RMA window, the node descriptors and (on the host) the message buffers
+	{
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI" << std::endl; )
+
+		instance_ = this; // publish the singleton returned by instance()
+		int p;
+		MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &p);
+		if (p != MPI_THREAD_MULTIPLE)
+		{
+			std::cerr << "Could not initialise MPI with MPI_THREAD_MULTIPLE, MPI_Init_thread() returned " << p << std::endl;
+		} // NOTE: only reported, construction continues with the provided thread level
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI ..." << std::endl; )
+
+		int t;
+		MPI_Comm_rank(MPI_COMM_WORLD, &t);
+		this_node_ = t;
+		MPI_Comm_size(MPI_COMM_WORLD, &t);
+		nodes_ = t;
+		host_node_ = 0; // TODO(improvement): make configurable, like for SCIF
+        MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &rma_win); // collective; memory is attached later in allocate_buffer()
+
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI done" << std::endl; )
+
+		peers = new mpi_peer[nodes_];
+		
+		// start of node descriptor code:
+		node_descriptions.resize(nodes_);
+		
+		// build own node descriptor
+		node_descriptor node_description;
+		int count;
+		MPI_Get_processor_name(node_description.name_, &count);
+		node_description.name_[count] = 0x0; // null terminate
+
+//		char hostname[MPI_MAX_PROCESSOR_NAME + 1];
+//		MPI_Get_processor_name(hostname, &count);
+//		hostname[count] = 0x0; // null terminate
+//		node_description.name_.assign(hostname, count);
+
+		// append rank for testing:
+		//node_description.name_[count] = 48 + this_node_;
+		//node_description.name_[count+1] = 0x0;
+
+		// communicate descriptors between nodes
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions" << std::endl; )
+		//MPI_Alltoall(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
+		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD); // descriptors travel as raw bytes
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
+		
+		if (is_host()) {
+			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+				// allocate buffers
+				peers[i].msg_buffers = allocate_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+				// fill resource pools
+				for(size_t j = constants::MSG_BUFFERS; j > 0; --j) {
+					peers[i].buffer_pool.add(j-1); // adds indices MSG_BUFFERS-1 down to 0
+				}
+			}
+		}
+	}
+
+	~communicator() // tears down the RMA window and then MPI itself
+	{
+		MPI_Win_free(&rma_win); // release the dynamic RMA window created in the constructor; must precede MPI_Finalize()
+		MPI_Finalize(); // TODO(improvement): check on error and create output if there was one
+		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
+	}
+
+	request allocate_request(node_t remote_node) // builds a valid request to remote_node using two buffer indices from that peer's pool
+	{
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+
+		const size_t send_buffer_index = peers[remote_node].buffer_pool.allocate(); // slot for the outgoing message
+		const size_t recv_buffer_index = peers[remote_node].buffer_pool.allocate(); // slot for the incoming result
+
+		return { remote_node, this_node_, send_buffer_index, recv_buffer_index }; // (target, source, send index, recv index)
+	}
+
+	void free_request(request& req) // returns both buffer indices of req to the peer's pool and marks req invalid
+	{
+		assert(req.valid());
+		assert(req.source_node == this_node_); // allocate_request() stores this_node_ as source, so only the allocating node may free
+	
+		mpi_peer& peer = peers[req.target_node];
+
+		peer.buffer_pool.free(req.send_buffer_index);
+		peer.buffer_pool.free(req.recv_buffer_index);
+		req.valid_ = false;
+	}
+
+public:
+	void send_msg(request_reference_type req, void* msg, size_t size) // non-blocking send of size bytes to req.target_node
+	{
+		// copy message from caller into transfer buffer
+		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
+		memcpy(msg_buffer, msg, size);
+		MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request()); // completion is tracked through the MPI request slot taken from req
+	}
+	
+	// to be used by the offload target's main loop: synchronously receive one message at a time
+	// NOTE: the local static receive buffer! The returned pointer refers to it, so the next call overwrites the previous message.
+	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE) // NOTE: msg is not used by this implementation
+	{
+		static msg_buffer buffer; // NOTE !
+		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // blocks until a DEFAULT_TAG message from the host arrives
+		return static_cast<void*>(&buffer);
+	}
+
+	// trigger receiving the result of a message on the sending side
+	void recv_result(request_reference_type req)
+	{
+		// posts a non-blocking receive for the RESULT_TAG message from the target into the request's receive buffer;
+		// completion is observed later through the MPI request stored in req (see request::test() / request::get())
+		MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+		return;
+	}
+
+	template<typename T>
+	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size) // blocking one-sided write of size elements of T to remote_dest
+	{
+		//MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Put(local_source, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win); // counts are bytes (MPI_BYTE); the target displacement is an MPI_Aint, not a pointer
+        MPI_Win_unlock(remote_dest.node(), rma_win); // closing the epoch completes the put
+	}
+
+	// to be used by the host
+	template<typename T>
+	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
+	{
+		//MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+        req.uses_rma = true;
+        // the lock epoch opened here stays open until request::get() has waited on the Rput request and calls MPI_Win_unlock()
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Rput(local_source, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win, &req.next_mpi_request()); // size counts elements of T, MPI counts bytes
+	}
+
+
+	template<typename T>
+	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size) // blocking one-sided read of size elements of T (replaces the two-sided MPI_Recv with DATA_TAG)
+	{
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win); // MPI_Get is only valid inside an access epoch; a shared lock suffices for reading
+        MPI_Get(local_dest, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win); // origin is the local buffer, target is the remote address; counts are bytes
+        MPI_Win_unlock(remote_source.node(), rma_win); // closing the epoch completes the get, local_dest is valid afterwards
+	}
+	
+	// to be used by the host
+	template<typename T>
+	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
+	{
+        req.uses_rma = true; // request::get() closes the lock epoch opened below once the Rget request has completed
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win); // MPI_Rget is only valid inside a passive-target epoch
+        MPI_Rget(local_dest, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win, &req.next_mpi_request()); // size counts elements of T, MPI counts bytes
+	}
+
+	template<typename T>
+	buffer_ptr<T> allocate_buffer(const size_t n, node_t source_node) // allocates room for n T's locally and attaches it to the RMA window (source_node is unused)
+	{
+		T* ptr = nullptr; // NOTE: no ctor is called on the allocated elements
+		const int err = posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+		assert(err == 0); (void)err; // a failed allocation must not be attached to the window
+        MPI_Aint mpi_address = 0;
+        MPI_Win_attach(rma_win, ptr, n * sizeof(T)); // attach the buffer itself, not the address of a local variable
+        MPI_Get_address(ptr, &mpi_address); // address that remote nodes pass as target displacement for the dynamic window
+		return buffer_ptr<T>(ptr, this_node_, mpi_address);
+	}
+
+	template<typename T>
+	void free_buffer(buffer_ptr<T> ptr) // detaches the buffer from the RMA window and releases its memory
+	{
+		assert(ptr.node() == this_node_); // only buffers of the calling node can be freed here
+		// NOTE: no dtor is called
+        MPI_Win_detach(rma_win, ptr.get());
+		free(static_cast<void*>(ptr.get()));
+	}
+
+	static communicator& instance() { return *instance_; } // instance_ is set by the constructor
+	static node_t this_node() { return instance().this_node_; } // rank of the calling process in MPI_COMM_WORLD
+	static size_t num_nodes() { return instance().nodes_; } // size of MPI_COMM_WORLD
+	bool is_host() { return this_node_ == 0; } // TODO(improvement): ham_address == ham_host_address ; }
+	bool is_host(node_t node) { return node == 0; } // TODO(improvement): node == ham_host_address; }
+
+	static const node_descriptor& get_node_description(node_t node) // descriptor gathered from rank `node` in the constructor
+	{
+		return instance().node_descriptions[node];
+	}
+
+private:
+	static communicator* instance_;
+	node_t this_node_;
+	size_t nodes_;
+	node_t host_node_;
+	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
+    MPI_Win rma_win; // globally shared dynamic window for rma ops
+		
+	struct mpi_peer { // per-remote-node state kept by the sender
+		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
+
+		// needed by sender to manage which buffers are in use and which are free
+		// just manages indices into msg_buffers: allocate_request() takes them, free_request() returns them
+		detail::resource_pool<size_t> buffer_pool;
+	};
+	
+	mpi_peer* peers;
+};
+
+template<typename T>
+buffer_ptr<T>::buffer_ptr() : buffer_ptr(nullptr, communicator::this_node()) { } // defined after communicator because it needs communicator::this_node()
+
+template<typename T>
+T& buffer_ptr<T>::operator[](size_t i) // element access, only meaningful for buffers that live on the calling node
+{
+	assert(node_ == communicator::this_node()); // ptr_ is a local address, dereferencing it for a remote buffer would be wrong
+	return ptr_[i];
+}
+
+} // namespace net
+} // namespace ham
+
+#endif // ham_net_communicator_mpi_hpp

From 590fa6a6bb46a75c2a85826614562b68e7fbe421 Mon Sep 17 00:00:00 2001
From: Deppisch <bzcdeppi@pvs-pc06.zib.de>
Date: Thu, 29 Mar 2018 13:43:49 +0200
Subject: [PATCH 012/150] initial commit of mpi_rma_dynamic prototype

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 368 ++++++++++++++++++
 1 file changed, 368 insertions(+)
 create mode 100644 include/ham/net/communicator_mpi_rma_dynamic.hpp

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
new file mode 100644
index 0000000..d1b1add
--- /dev/null
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -0,0 +1,368 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef ham_net_communicator_mpi_hpp
+#define ham_net_communicator_mpi_hpp
+
+#include <mpi.h>
+
+#include <cassert>
+#include <cstring> // memcpy
+#include <stdlib.h> // posix_memalign
+
+#include "ham/misc/constants.hpp"
+#include "ham/misc/resource_pool.hpp"
+#include "ham/misc/types.hpp"
+#include "ham/util/debug.hpp"
+#include "ham/util/log.hpp"
+
+namespace ham {
+namespace net {
+
+// Handle to a buffer of T on node node_: the node-local pointer plus the MPI address used to target it in RMA operations.
+template<typename T>
+class buffer_ptr {
+public:
+	buffer_ptr(); // null buffer on the calling node
+    buffer_ptr(T* ptr, node_t node) : ptr_(ptr), node_(node), mpi_address_(0) { } // no MPI address known (stored as 0)
+	buffer_ptr(T* ptr, node_t node, MPI_Aint mpi_address) : ptr_(ptr), node_(node), mpi_address_(mpi_address) { }
+
+	T* get() { return ptr_; }
+	node_t node() { return node_; }
+    MPI_Aint get_mpi_address() { return mpi_address_; }
+
+    // element access
+	T& operator [] (size_t i);
+	// basic pointer arithmetic to address sub-buffers; the MPI address is advanced by the same number
+	// of bytes so the result stays usable as an RMA target (an unknown address, i.e. 0, stays 0)
+	buffer_ptr<T> operator+(size_t off)
+	{
+		const MPI_Aint shifted = (mpi_address_ == 0) ? 0 : mpi_address_ + static_cast<MPI_Aint>(off * sizeof(T));
+		return buffer_ptr(ptr_ + off, node_, shifted);
+	}
+private:
+	T* ptr_;
+	node_t node_;
+    MPI_Aint mpi_address_;
+};
+
+class node_descriptor // fixed-size description of a node, currently just its processor name
+{
+public:
+	//node_descriptor() : name(MPI_MAX_PROCESSOR_NAME, 0) {}
+	// returns the processor name as a C string (filled and null-terminated by the communicator constructor)
+	//const std::string& name() const { return name_; }
+	const char* name() const { return name_; }
+private:
+	//std::string name_; // TODO(improvement): unify node description for all back-ends, NOTE: std::string is not trivially transferable
+	char name_[MPI_MAX_PROCESSOR_NAME + 1];
+	// the communicator writes name_ directly and exchanges whole descriptors as raw bytes
+	friend class net::communicator;
+};
+
+class communicator {
+public:
+	// externally used interface of request must be shared across all communicator-implementations
+	class request {
+	public:
+		request() : valid_(false), uses_rma(false), req_count(0) {} // instantiate invalid, with no pending MPI requests
+		request(node_t target_node, node_t source_node, size_t send_buffer_index, size_t recv_buffer_index)
+		 : target_node(target_node), source_node(source_node), valid_(true), uses_rma(false), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0)
+		{}
+
+		// return true if request was finished
+        // will not work as intended for rma ops, no equivalent to test() available for remote completion
+		bool test()
+		{
+			int flag = 0;
+			MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUSES_IGNORE); // tests all MPI requests collected so far; the array variant needs MPI_STATUSES_IGNORE
+
+            if(uses_rma)
+            {
+                HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
+            }
+
+            return flag != 0;
+		}
+		// waits for all MPI requests of this request, closes an open RMA lock epoch, and returns the result buffer
+		void* get() // blocks
+		{
+			HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+			MPI_Waitall(req_count, mpi_reqs, MPI_STATUSES_IGNORE); // must wait for all requests to satisfy the standard
+			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
+            if(uses_rma)
+            {
+                MPI_Win_unlock(target_node, communicator::instance().rma_win); // rma_win is a member of the communicator, not of request
+                uses_rma = false; // the epoch is closed; a repeated get() must not unlock again
+            }
+			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
+		}
+		// sends size bytes of result_msg back to source_node with RESULT_TAG; must be called on the target node
+		template<class T>
+		void send_result(T* result_msg, size_t size)
+		{
+			assert(communicator::this_node() == target_node); // this assert fails if send_result is called from the wrong side
+			
+			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
+			MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
+			//communicator::instance().send_msg(source_node, source_buffer_index, NO_BUFFER_INDEX, result_msg, size);
+		}
+
+		bool valid() const
+		{
+			return valid_;
+		}
+		// read accessor for the uses_rma flag; named is_rma() because a member function cannot share the data member's name
+        bool is_rma() const
+        {
+            return uses_rma;
+        }
+		// hands out the next free MPI_Request slot; at most NUM_REQUESTS slots exist per request
+		MPI_Request& next_mpi_request()
+		{
+			HAM_DEBUG( HAM_LOG << "next_mpi_request(): this=" << this << ", req_count=" << req_count << ", NUM_REQUESTS=" << NUM_REQUESTS << std::endl; )
+			assert(req_count < NUM_REQUESTS);
+			return mpi_reqs[req_count++]; // NOTE: post-increment
+		}
+
+		node_t target_node;
+		node_t source_node;
+		bool valid_;
+        bool uses_rma; // set by the communicator's *_async RMA transfers, cleared by get()
+
+		// only needed by the sender
+		enum { NUM_REQUESTS = 3 };
+		
+		size_t send_buffer_index; // buffer to use for sending the message
+		size_t recv_buffer_index; // buffer to use for receiving the result
+		size_t req_count;
+		
+	private:
+		MPI_Request mpi_reqs[NUM_REQUESTS]; // for sending the msg, receiving the result, and an associated data transfer
+	}; // class request
+
+	typedef request& request_reference_type;
+	typedef const request& request_const_reference_type;
+
+	communicator(int argc, char* argv[]) // sets up MPI, the dynamic RMA window, the node descriptors and (on the host) the message buffers
+	{
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI" << std::endl; )
+
+		instance_ = this; // publish the singleton returned by instance()
+		int p;
+		MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &p);
+		if (p != MPI_THREAD_MULTIPLE)
+		{
+			std::cerr << "Could not initialise MPI with MPI_THREAD_MULTIPLE, MPI_Init_thread() returned " << p << std::endl;
+		} // NOTE: only reported, construction continues with the provided thread level
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI ..." << std::endl; )
+
+		int t;
+		MPI_Comm_rank(MPI_COMM_WORLD, &t);
+		this_node_ = t;
+		MPI_Comm_size(MPI_COMM_WORLD, &t);
+		nodes_ = t;
+		host_node_ = 0; // TODO(improvement): make configurable, like for SCIF
+        MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &rma_win); // collective; memory is attached later in allocate_buffer()
+
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI done" << std::endl; )
+
+		peers = new mpi_peer[nodes_];
+		
+		// start of node descriptor code:
+		node_descriptions.resize(nodes_);
+		
+		// build own node descriptor
+		node_descriptor node_description;
+		int count;
+		MPI_Get_processor_name(node_description.name_, &count);
+		node_description.name_[count] = 0x0; // null terminate
+
+//		char hostname[MPI_MAX_PROCESSOR_NAME + 1];
+//		MPI_Get_processor_name(hostname, &count);
+//		hostname[count] = 0x0; // null terminate
+//		node_description.name_.assign(hostname, count);
+
+		// append rank for testing:
+		//node_description.name_[count] = 48 + this_node_;
+		//node_description.name_[count+1] = 0x0;
+
+		// communicate descriptors between nodes
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions" << std::endl; )
+		//MPI_Alltoall(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
+		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD); // descriptors travel as raw bytes
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
+		
+		if (is_host()) {
+			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+				// allocate buffers
+				peers[i].msg_buffers = allocate_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+				// fill resource pools
+				for(size_t j = constants::MSG_BUFFERS; j > 0; --j) {
+					peers[i].buffer_pool.add(j-1); // adds indices MSG_BUFFERS-1 down to 0
+				}
+			}
+		}
+	}
+
+	~communicator() // tears down the RMA window and then MPI itself
+	{
+		MPI_Win_free(&rma_win); // release the dynamic RMA window created in the constructor; must precede MPI_Finalize()
+		MPI_Finalize(); // TODO(improvement): check on error and create output if there was one
+		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
+	}
+
+	request allocate_request(node_t remote_node) // builds a valid request to remote_node using two buffer indices from that peer's pool
+	{
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+
+		const size_t send_buffer_index = peers[remote_node].buffer_pool.allocate(); // slot for the outgoing message
+		const size_t recv_buffer_index = peers[remote_node].buffer_pool.allocate(); // slot for the incoming result
+
+		return { remote_node, this_node_, send_buffer_index, recv_buffer_index }; // (target, source, send index, recv index)
+	}
+
+	void free_request(request& req) // returns both buffer indices of req to the peer's pool and marks req invalid
+	{
+		assert(req.valid());
+		assert(req.source_node == this_node_); // allocate_request() stores this_node_ as source, so only the allocating node may free
+	
+		mpi_peer& peer = peers[req.target_node];
+
+		peer.buffer_pool.free(req.send_buffer_index);
+		peer.buffer_pool.free(req.recv_buffer_index);
+		req.valid_ = false;
+	}
+
+public:
+	void send_msg(request_reference_type req, void* msg, size_t size) // non-blocking send of size bytes to req.target_node
+	{
+		// copy message from caller into transfer buffer
+		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
+		memcpy(msg_buffer, msg, size);
+		MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request()); // completion is tracked through the MPI request slot taken from req
+	}
+	
+	// to be used by the offload target's main loop: synchronously receive one message at a time
+	// NOTE: the local static receive buffer! The returned pointer refers to it, so the next call overwrites the previous message.
+	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE) // NOTE: msg is not used by this implementation
+	{
+		static msg_buffer buffer; // NOTE !
+		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // blocks until a DEFAULT_TAG message from the host arrives
+		return static_cast<void*>(&buffer);
+	}
+
+	// trigger receiving the result of a message on the sending side
+	void recv_result(request_reference_type req)
+	{
+		// posts a non-blocking receive for the RESULT_TAG message from the target into the request's receive buffer;
+		// completion is observed later through the MPI request stored in req (see request::test() / request::get())
+		MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+		return;
+	}
+
+	template<typename T>
+	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size) // blocking one-sided write of size elements of T to remote_dest
+	{
+		//MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Put(local_source, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win); // counts are bytes (MPI_BYTE); the target displacement is an MPI_Aint, not a pointer
+        MPI_Win_unlock(remote_dest.node(), rma_win); // closing the epoch completes the put
+	}
+
+	// to be used by the host
+	template<typename T>
+	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
+	{
+		//MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+        req.uses_rma = true;
+        // the lock epoch opened here stays open until request::get() has waited on the Rput request and calls MPI_Win_unlock()
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Rput(local_source, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win, &req.next_mpi_request()); // size counts elements of T, MPI counts bytes
+	}
+
+
+	template<typename T>
+	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size) // blocking one-sided read of size elements of T (replaces the two-sided MPI_Recv with DATA_TAG)
+	{
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win); // MPI_Get is only valid inside an access epoch; a shared lock suffices for reading
+        MPI_Get(local_dest, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win); // origin is the local buffer, target is the remote address; counts are bytes
+        MPI_Win_unlock(remote_source.node(), rma_win); // closing the epoch completes the get, local_dest is valid afterwards
+	}
+	
+	// to be used by the host
+	template<typename T>
+	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
+	{
+        req.uses_rma = true; // request::get() closes the lock epoch opened below once the Rget request has completed
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win); // MPI_Rget is only valid inside a passive-target epoch
+        MPI_Rget(local_dest, static_cast<int>(size * sizeof(T)), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), static_cast<int>(size * sizeof(T)), MPI_BYTE, rma_win, &req.next_mpi_request()); // size counts elements of T, MPI counts bytes
+	}
+
+	template<typename T>
+	buffer_ptr<T> allocate_buffer(const size_t n, node_t source_node) // allocates room for n T's locally and attaches it to the RMA window (source_node is unused)
+	{
+		T* ptr = nullptr; // NOTE: no ctor is called on the allocated elements
+		const int err = posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+		assert(err == 0); (void)err; // a failed allocation must not be attached to the window
+        MPI_Aint mpi_address = 0;
+        MPI_Win_attach(rma_win, ptr, n * sizeof(T)); // attach the buffer itself, not the address of a local variable
+        MPI_Get_address(ptr, &mpi_address); // address that remote nodes pass as target displacement for the dynamic window
+		return buffer_ptr<T>(ptr, this_node_, mpi_address);
+	}
+
+	template<typename T>
+	void free_buffer(buffer_ptr<T> ptr) // detaches the buffer from the RMA window and releases its memory
+	{
+		assert(ptr.node() == this_node_); // only buffers of the calling node can be freed here
+		// NOTE: no dtor is called
+        MPI_Win_detach(rma_win, ptr.get());
+		free(static_cast<void*>(ptr.get()));
+	}
+
+	static communicator& instance() { return *instance_; } // instance_ is set by the constructor
+	static node_t this_node() { return instance().this_node_; } // rank of the calling process in MPI_COMM_WORLD
+	static size_t num_nodes() { return instance().nodes_; } // size of MPI_COMM_WORLD
+	bool is_host() { return this_node_ == 0; } // TODO(improvement): ham_address == ham_host_address ; }
+	bool is_host(node_t node) { return node == 0; } // TODO(improvement): node == ham_host_address; }
+
+	static const node_descriptor& get_node_description(node_t node) // descriptor gathered from rank `node` in the constructor
+	{
+		return instance().node_descriptions[node];
+	}
+
+private:
+	static communicator* instance_;
+	node_t this_node_;
+	size_t nodes_;
+	node_t host_node_;
+	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
+    MPI_Win rma_win; // globally shared dynamic window for rma ops
+		
+	struct mpi_peer { // per-remote-node state kept by the sender
+		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
+
+		// needed by sender to manage which buffers are in use and which are free
+		// just manages indices into msg_buffers: allocate_request() takes them, free_request() returns them
+		detail::resource_pool<size_t> buffer_pool;
+	};
+	
+	mpi_peer* peers;
+};
+
+template<typename T>
+buffer_ptr<T>::buffer_ptr() : buffer_ptr(nullptr, communicator::this_node()) { } // defined after communicator because it needs communicator::this_node()
+
+template<typename T>
+T& buffer_ptr<T>::operator[](size_t i) // element access, only meaningful for buffers that live on the calling node
+{
+	assert(node_ == communicator::this_node()); // ptr_ is a local address, dereferencing it for a remote buffer would be wrong
+	return ptr_[i];
+}
+
+} // namespace net
+} // namespace ham
+
+#endif // ham_net_communicator_mpi_hpp

From 6686a91e381aced0791749f3cf2fbb6984bd7409 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 29 Mar 2018 15:05:29 +0200
Subject: [PATCH 013/150] use shared locks for mpi rma windows

---
 include/ham/net/communicator.hpp              |  4 ++-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  |  4 +--
 include/ham/offload/offload.hpp               | 30 +++++++++++++++----
 include/ham/offload/offload_msg.hpp           |  3 +-
 4 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index 52fe25b..4e84e2b 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -49,8 +49,10 @@ namespace net {
 #elif defined HAM_COMM_SCIF
 #define HAM_COMM_ONE_SIDED
 #include "ham/net/communicator_scif.hpp"
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC // MPI back-end that transfers data one-sided through a dynamic RMA window
+#include "ham/net/communicator_mpi_rma_dynamic.hpp"
 #else
-static_assert(false, "Please define either HAM_COMM_MPI, or HAM_COMM_SCIF.");
+static_assert(false, "Please define either HAM_COMM_MPI, HAM_COMM_MPI_RMA_DYNAMIC or HAM_COMM_SCIF.");
 #endif
 
 #endif // ham_net_communicator_hpp
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index d1b1add..ebe7a10 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -267,7 +267,7 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		//MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
-        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
         MPI_Put(local_source, size, MPI_BYTE, remote_dest.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win);
         MPI_Win_unlock(remote_dest.node(), rma_win);
 	}
@@ -279,7 +279,7 @@ class communicator {
 		//MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
         req.uses_rma = true;
 
-        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
         MPI_Rput(local_source, size, MPI_BYTE, remote_dest.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win, &re.next_mpi_request());
 	}
 
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index a3cff70..ff41fd7 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -239,6 +239,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
     future<void> result(comm.allocate_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
 	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+    return result;
 #endif
 }
 
@@ -339,13 +340,30 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	write_result.get();
 #endif
 #ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
-	future<void> result(comm.allocate_request(source.node()));
-	HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
-	SEND READ_MSG to source (maybe introduce new copy_msg)
-	MAKE SURE there is no winlock on dest from host
-
-	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+    // use async copy + sync
+    copy(source, dest, n).get();
+#endif
+}
 #endif
+
+
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+template<typename T>
+future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
+{
+    net::communicator& comm = runtime::instance().communicator();
+
+    // make sure there is no winlock on dest from host
+    // solution: shared lock
+
+    // issues a put on the source node targeting the destination node
+    future<void> result(comm.allocate_request(source.node()));
+    HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
+        auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
+        comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof write_msg);
+        comm.recv_result(result.get_request());
+
+        return result;
 }
 #endif
 
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index 845dd08..b16a8a0 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -142,7 +142,7 @@ class offload_read_msg
             : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
     {
     public:
-        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr,T* local_source, size_t n)
+        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr, T* local_source, size_t n)
                 : req(req), remote_node(remote_node), remote_addr(remote_addr), local_source(local_source), n(n) { }
 
         void operator()() //const
@@ -163,6 +163,7 @@ class offload_read_msg
         size_t n;
     };
 //#endif
+
 } // namespace detail
 } // namespace offload
 } // namespace ham

From e684eb43da8e96d3eee7a9a14230cd44e22ce00b Mon Sep 17 00:00:00 2001
From: Deppisch <bzcdeppi@pvs-pc06.zib.de>
Date: Thu, 29 Mar 2018 15:05:29 +0200
Subject: [PATCH 014/150] use shared locks for mpi rma windows

---
 include/ham/net/communicator.hpp              |  4 ++-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  |  4 +--
 include/ham/offload/offload.hpp               | 30 +++++++++++++++----
 include/ham/offload/offload_msg.hpp           |  3 +-
 4 files changed, 31 insertions(+), 10 deletions(-)

diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index 52fe25b..4e84e2b 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -49,8 +49,10 @@ namespace net {
 #elif defined HAM_COMM_SCIF
 #define HAM_COMM_ONE_SIDED
 #include "ham/net/communicator_scif.hpp"
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
+#include "ham/net/communicator_mpi_rma_dynamic.hpp" // MPI RMA (dynamic window) back-end, not SCIF
 #else
-static_assert(false, "Please define either HAM_COMM_MPI, or HAM_COMM_SCIF.");
+static_assert(false, "Please define either HAM_COMM_MPI, HAM_COMM_MPI_RMA_DYNAMIC or HAM_COMM_SCIF.");
 #endif
 
 #endif // ham_net_communicator_hpp
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index d1b1add..ebe7a10 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -267,7 +267,7 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		//MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
-        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
         MPI_Put(local_source, size, MPI_BYTE, remote_dest.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win);
         MPI_Win_unlock(remote_dest.node(), rma_win);
 	}
@@ -279,7 +279,7 @@ class communicator {
 		//MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
         req.uses_rma = true;
 
-        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, remote_dest.node(), 0, rma_win);
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
         MPI_Rput(local_source, size, MPI_BYTE, remote_dest.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win, &re.next_mpi_request());
 	}
 
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index a3cff70..ff41fd7 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -239,6 +239,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
     future<void> result(comm.allocate_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
 	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+    return result;
 #endif
 }
 
@@ -339,13 +340,30 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	write_result.get();
 #endif
 #ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
-	future<void> result(comm.allocate_request(source.node()));
-	HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
-	SEND READ_MSG to source (maybe introduce new copy_msg)
-	MAKE SURE there is no winlock on dest from host
-
-	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
+    // use async copy + sync
+    copy(source, dest, n).get();
+#endif
+}
 #endif
+
+
+#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+template<typename T>
+future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
+{
+    net::communicator& comm = runtime::instance().communicator();
+
+    // make sure there is no winlock on dest from host
+    // solution: shared lock
+
+    // issues a put on the source node targeting the destination node
+    future<void> result(comm.allocate_request(source.node()));
+    HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
+        auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
+        comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof write_msg);
+        comm.recv_result(result.get_request());
+
+        return result;
 }
 #endif
 
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index 845dd08..b16a8a0 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -142,7 +142,7 @@ class offload_read_msg
             : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
     {
     public:
-        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr,T* local_source, size_t n)
+        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr, T* local_source, size_t n)
                 : req(req), remote_node(remote_node), remote_addr(remote_addr), local_source(local_source), n(n) { }
 
         void operator()() //const
@@ -163,6 +163,7 @@ class offload_read_msg
         size_t n;
     };
 //#endif
+
 } // namespace detail
 } // namespace offload
 } // namespace ham

From f6c6925f761e0f9cc3374ff0bb765d603deba8c8 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 29 Mar 2018 16:00:14 +0200
Subject: [PATCH 015/150] compile integration of rma dynamic backend

---
 Jamroot                                      | 48 ++++++++++++++++++--
 include/ham/offload/offload.hpp              | 27 +++++------
 src/ham/net/communicator_mpi_rma_dynamic.cpp |  9 ++++
 3 files changed, 64 insertions(+), 20 deletions(-)
 create mode 100644 src/ham/net/communicator_mpi_rma_dynamic.cpp

diff --git a/Jamroot b/Jamroot
index 1d27769..0e06729 100644
--- a/Jamroot
+++ b/Jamroot
@@ -56,6 +56,13 @@ obj offload_obj_mpi : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_C
 
 constant OBJ_FILES_MPI : communicator_obj_mpi runtime_obj_mpi offload_obj_mpi communicator_mpi_obj_mpi ;
 
+obj communicator_obj_mpi_rma_dyn : ham/net/communicator.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj communicator_mpi_rma_dyn_obj_mpi_rma_dyn : ham/net/communicator_mpi_rma_dynamic.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj runtime_obj_mpi_rma_dyn : ham/offload/runtime.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj offload_obj_mpi_rma_dyn : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+
+constant OBJ_FILES_MPI_RMA_DYN : communicator_obj_mpi_rma_dyn communicator_mpi_rma_dyn_obj_mpi_rma_dyn runtime_obj_mpi_rma_dyn offload_obj_mpi_rma_dyn ;
+
 obj communicator_obj_scif : ham/net/communicator.cpp : <library>scif <define>HAM_COMM_SCIF ;
 obj communicator_scif_obj_scif : ham/net/communicator_scif.cpp : <library>scif <define>HAM_COMM_SCIF ;
 obj runtime_obj_scif : ham/offload/runtime.cpp : <library>scif <define>HAM_COMM_SCIF ;
@@ -66,6 +73,7 @@ constant OBJ_FILES_SCIF : communicator_obj_scif runtime_obj_scif offload_obj_sci
 # Libraries
 
 obj main_obj_mpi : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
+obj main_obj_mpi_rma_dyn : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 obj main_obj_scif : ham/offload/main.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi
@@ -73,12 +81,18 @@ lib ham_offload_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
+lib ham_offload_mpi_rma_dyn
+    : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_obj_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+    ;
+
 lib ham_offload_scif
 	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_obj_scif boost_program_options
 	: <library>scif <define>HAM_COMM_SCIF
 	;
 
 obj main_explicit_obj_mpi : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
+obj main_explicit_obj_mpi_rma_dyn : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 obj main_explicit_obj_scif : ham/offload/main_explicit.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi_explicit
@@ -86,6 +100,10 @@ lib ham_offload_mpi_explicit
 	: <library>/mpi//mpi <define>HAM_COMM_MPI <define>HAM_EXPLICIT
 	;
 
+lib ham_offload_mpi_rma_dyn_explicit
+    : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_explicit_obj_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_EXPLICIT ;
+
 lib ham_offload_scif_explicit
 	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_explicit_obj_scif boost_program_options
 	: <library>scif <define>HAM_COMM_SCIF <define>HAM_EXPLICIT
@@ -99,6 +117,12 @@ exe benchmark_ham_offload_mpi
 	: <library>/mpi//mpi <library>ham_offload_mpi
 	;	
 
+obj benchmark_ham_offload_mpi_rma_dyn_obj : benchmark_ham_offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+exe benchmark_ham_offload_mpi_rma_dyn
+    : benchmark_ham_offload_mpi_rma_dyn_obj boost_program_options
+    : <library>/mpi//mpi <library>ham_offload_mpi_rma_dyn
+    ;
+
 obj benchmark_ham_offload_scif_obj : benchmark_ham_offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
 exe benchmark_ham_offload_scif
 	: benchmark_ham_offload_scif_obj boost_program_options ham_offload_scif
@@ -121,14 +145,16 @@ exe active_msgs_over_file
 	;
 
 exe ham_offload
-	: ham_offload.cpp ham_offload_mpi boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: ham_offload.cpp ham_offload_mpi_rma_dyn boost_program_options
+#	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 #	: <library>scif <define>HAM_COMM_SCIF
 	;
 
 exe ham_offload_explicit
-	: ham_offload_explicit.cpp ham_offload_mpi_explicit boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: ham_offload_explicit.cpp ham_offload_mpi_rma_dyn_explicit boost_program_options
+#	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 #	: <library>scif <define>HAM_COMM_SCIF
 	;
 
@@ -142,7 +168,11 @@ exe inner_product_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
-#
+exe inner_product_mpi_rma_dynamic
+    : [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+    ;
+
 exe test_data_transfer_scif
 	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
 	: <library>scif <define>HAM_COMM_SCIF
@@ -153,6 +183,10 @@ exe test_data_transfer_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
+exe test_data_transfer_mpi_rma_dynamic
+	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	;
 
 exe test_argument_transfer_scif
 	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
@@ -164,6 +198,10 @@ exe test_argument_transfer_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
+exe test_argument_transfer_mpi_rma_dynamic
+	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	;
 
 # Explicit targets (not built by default)
 explicit benchmark_intel_leo ;
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index ff41fd7..f721597 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -223,7 +223,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	// TODO(improvement): create a data transfer thread for one-sided
 	comm.send_data(local_source, remote_dest, n); // sync
 	return future<void>(true); // return dummy future
-#else
+#elif defined HAM_COMM_MPI
 	// allocate a request and construct a future
 	future<void> result(comm.allocate_request(remote_dest.node()));
 	// generate an offload message
@@ -234,8 +234,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	comm.recv_result(result.get_request()); // trigger receiving the msgs result // async
 	
 	return result;
-#endif
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
     future<void> result(comm.allocate_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
 	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
@@ -262,7 +261,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	// TODO(improvement): create a data transfer thread for one-sided
 	comm.recv_data(remote_source, local_dest, n); // sync
 	return future<void>(true); // return dummy future
-#else
+#elif defined HAM_COMM_MPI
 	// allocate a request and construct a future
 	future<void> result(comm.allocate_request(remote_source.node()));
 	// generate an offload message
@@ -273,8 +272,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	comm.recv_result(result.get_request()); // trigger receiving the result
 
 	return result;
-#endif
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
 	future<void> result(comm.allocate_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA get..." << std::endl; )
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
@@ -320,7 +318,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 // fix 1st arg:
 //	comm.send_data(src_node, local_source, remote_dest, n);
 //	static_assert(false, "copy is not implemented yet for the SCIF back-end");
-#else
+#elif defined HAM_COMM_MPI
 	// send corresponding write and read messages to the sender and the receiver
 
 	// issues a send operation on the source node, that sends the memory at source to the destination node
@@ -338,8 +336,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	// synchronise
 	read_result.get();
 	write_result.get();
-#endif
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
     // use async copy + sync
     copy(source, dest, n).get();
 #endif
@@ -347,23 +344,23 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 #endif
 
 
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC // compile-integration pending
 template<typename T>
 future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
     net::communicator& comm = runtime::instance().communicator();
 
     // make sure there is no winlock on dest from host
-    // solution: shared lock
+    // solution: shared lock, unlocking from host not necessary
 
     // issues a put on the source node targeting the destination node
     future<void> result(comm.allocate_request(source.node()));
     HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
-        auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
-        comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof write_msg);
-        comm.recv_result(result.get_request());
+    auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
+    comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof copy_msg); // size of the message actually sent
+    comm.recv_result(result.get_request());
 
-        return result;
+    return result;
 }
 #endif
 
diff --git a/src/ham/net/communicator_mpi_rma_dynamic.cpp b/src/ham/net/communicator_mpi_rma_dynamic.cpp
new file mode 100644
index 0000000..e4e5dbd
--- /dev/null
+++ b/src/ham/net/communicator_mpi_rma_dynamic.cpp
@@ -0,0 +1,9 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/net/communicator.hpp"
+
+ham::net::communicator* ham::net::communicator::instance_ = nullptr;
+

From 6c15d13655d8526e11a835b0d5d1d70aaa0f6b00 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 29 Mar 2018 16:00:14 +0200
Subject: [PATCH 016/150] compile integration of rma dynamic backend

---
 Jamroot                                      | 48 ++++++++++++++++++--
 include/ham/offload/offload.hpp              | 27 +++++------
 src/ham/net/communicator_mpi_rma_dynamic.cpp |  9 ++++
 3 files changed, 64 insertions(+), 20 deletions(-)
 create mode 100644 src/ham/net/communicator_mpi_rma_dynamic.cpp

diff --git a/Jamroot b/Jamroot
index 1d27769..0e06729 100644
--- a/Jamroot
+++ b/Jamroot
@@ -56,6 +56,13 @@ obj offload_obj_mpi : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_C
 
 constant OBJ_FILES_MPI : communicator_obj_mpi runtime_obj_mpi offload_obj_mpi communicator_mpi_obj_mpi ;
 
+obj communicator_obj_mpi_rma_dyn : ham/net/communicator.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj communicator_mpi_rma_dyn_obj_mpi_rma_dyn : ham/net/communicator_mpi_rma_dynamic.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj runtime_obj_mpi_rma_dyn : ham/offload/runtime.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj offload_obj_mpi_rma_dyn : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+
+constant OBJ_FILES_MPI_RMA_DYN : communicator_obj_mpi_rma_dyn communicator_mpi_rma_dyn_obj_mpi_rma_dyn runtime_obj_mpi_rma_dyn offload_obj_mpi_rma_dyn ;
+
 obj communicator_obj_scif : ham/net/communicator.cpp : <library>scif <define>HAM_COMM_SCIF ;
 obj communicator_scif_obj_scif : ham/net/communicator_scif.cpp : <library>scif <define>HAM_COMM_SCIF ;
 obj runtime_obj_scif : ham/offload/runtime.cpp : <library>scif <define>HAM_COMM_SCIF ;
@@ -66,6 +73,7 @@ constant OBJ_FILES_SCIF : communicator_obj_scif runtime_obj_scif offload_obj_sci
 # Libraries
 
 obj main_obj_mpi : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
+obj main_obj_mpi_rma_dyn : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 obj main_obj_scif : ham/offload/main.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi
@@ -73,12 +81,18 @@ lib ham_offload_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
+lib ham_offload_mpi_rma_dyn
+    : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_obj_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+    ;
+
 lib ham_offload_scif
 	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_obj_scif boost_program_options
 	: <library>scif <define>HAM_COMM_SCIF
 	;
 
 obj main_explicit_obj_mpi : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
+obj main_explicit_obj_mpi_rma_dyn : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 obj main_explicit_obj_scif : ham/offload/main_explicit.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi_explicit
@@ -86,6 +100,10 @@ lib ham_offload_mpi_explicit
 	: <library>/mpi//mpi <define>HAM_COMM_MPI <define>HAM_EXPLICIT
 	;
 
+lib ham_offload_mpi_rma_dyn_explicit
+    : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_explicit_obj_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_EXPLICIT ;
+
 lib ham_offload_scif_explicit
 	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_explicit_obj_scif boost_program_options
 	: <library>scif <define>HAM_COMM_SCIF <define>HAM_EXPLICIT
@@ -99,6 +117,12 @@ exe benchmark_ham_offload_mpi
 	: <library>/mpi//mpi <library>ham_offload_mpi
 	;	
 
+obj benchmark_ham_offload_mpi_rma_dyn_obj : benchmark_ham_offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+exe benchmark_ham_offload_mpi_rma_dyn
+    : benchmark_ham_offload_mpi_rma_dyn_obj boost_program_options
+    : <library>/mpi//mpi <library>ham_offload_mpi_rma_dyn
+    ;
+
 obj benchmark_ham_offload_scif_obj : benchmark_ham_offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
 exe benchmark_ham_offload_scif
 	: benchmark_ham_offload_scif_obj boost_program_options ham_offload_scif
@@ -121,14 +145,16 @@ exe active_msgs_over_file
 	;
 
 exe ham_offload
-	: ham_offload.cpp ham_offload_mpi boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: ham_offload.cpp ham_offload_mpi_rma_dyn boost_program_options
+#	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 #	: <library>scif <define>HAM_COMM_SCIF
 	;
 
 exe ham_offload_explicit
-	: ham_offload_explicit.cpp ham_offload_mpi_explicit boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: ham_offload_explicit.cpp ham_offload_mpi_rma_dyn_explicit boost_program_options
+#	: <library>/mpi//mpi <define>HAM_COMM_MPI
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 #	: <library>scif <define>HAM_COMM_SCIF
 	;
 
@@ -142,7 +168,11 @@ exe inner_product_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
-#
+exe inner_product_mpi_rma_dynamic
+    : [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+    ;
+
 exe test_data_transfer_scif
 	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
 	: <library>scif <define>HAM_COMM_SCIF
@@ -153,6 +183,10 @@ exe test_data_transfer_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
+exe test_data_transfer_mpi_rma_dynamic
+	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	;
 
 exe test_argument_transfer_scif
 	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
@@ -164,6 +198,10 @@ exe test_argument_transfer_mpi
 	: <library>/mpi//mpi <define>HAM_COMM_MPI
 	;
 
+exe test_argument_transfer_mpi_rma_dynamic
+	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	;
 
 # Explicit targets (not built by default)
 explicit benchmark_intel_leo ;
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index ff41fd7..f721597 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -223,7 +223,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	// TODO(improvement): create a data transfer thread for one-sided
 	comm.send_data(local_source, remote_dest, n); // sync
 	return future<void>(true); // return dummy future
-#else
+#elif defined HAM_COMM_MPI
 	// allocate a request and construct a future
 	future<void> result(comm.allocate_request(remote_dest.node()));
 	// generate an offload message
@@ -234,8 +234,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	comm.recv_result(result.get_request()); // trigger receiving the msgs result // async
 	
 	return result;
-#endif
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
     future<void> result(comm.allocate_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
 	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
@@ -262,7 +261,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	// TODO(improvement): create a data transfer thread for one-sided
 	comm.recv_data(remote_source, local_dest, n); // sync
 	return future<void>(true); // return dummy future
-#else
+#elif defined HAM_COMM_MPI
 	// allocate a request and construct a future
 	future<void> result(comm.allocate_request(remote_source.node()));
 	// generate an offload message
@@ -273,8 +272,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	comm.recv_result(result.get_request()); // trigger receiving the result
 
 	return result;
-#endif
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-time integration pending
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
 	future<void> result(comm.allocate_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA get..." << std::endl; )
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
@@ -320,7 +318,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 // fix 1st arg:
 //	comm.send_data(src_node, local_source, remote_dest, n);
 //	static_assert(false, "copy is not implemented yet for the SCIF back-end");
-#else
+#elif defined HAM_COMM_MPI
 	// send corresponding write and read messages to the sender and the receiver
 
 	// issues a send operation on the source node, that sends the memory at source to the destination node
@@ -338,8 +336,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	// synchronise
 	read_result.get();
 	write_result.get();
-#endif
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC
     // use async copy + sync
     copy(source, dest, n).get();
 #endif
@@ -347,23 +344,23 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 #endif
 
 
-#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC // compile-integration pending
 template<typename T>
 future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
     net::communicator& comm = runtime::instance().communicator();
 
     // make sure there is no winlock on dest from host
-    // solution: shared lock
+    // solution: shared lock, unlocking from host not necessary
 
     // issues a put on the source node targeting the destination node
     future<void> result(comm.allocate_request(source.node()));
     HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
-        auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
-        comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof write_msg);
-        comm.recv_result(result.get_request());
+    auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
+    comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof copy_msg); // size of the message actually sent
+    comm.recv_result(result.get_request());
 
-        return result;
+    return result;
 }
 #endif
 
diff --git a/src/ham/net/communicator_mpi_rma_dynamic.cpp b/src/ham/net/communicator_mpi_rma_dynamic.cpp
new file mode 100644
index 0000000..e4e5dbd
--- /dev/null
+++ b/src/ham/net/communicator_mpi_rma_dynamic.cpp
@@ -0,0 +1,9 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/net/communicator.hpp"
+
+ham::net::communicator* ham::net::communicator::instance_ = nullptr;
+

From cc24b4f647b1685c88e6159ddb8f3bbbb4a35f86 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 4 Apr 2018 14:45:08 +0200
Subject: [PATCH 017/150] bugfixes

---
 Jamroot                                       | 99 ++++++++++---------
 include/ham/net/communicator.hpp              |  2 +-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 53 +++++++---
 include/ham/offload/offload.hpp               | 44 +++++----
 include/ham/offload/offload_msg.hpp           |  2 +-
 src/inner_product.cpp                         | 33 +++++--
 src/test_data_transfer.cpp                    | 38 ++++++-
 tools/install_boost.sh                        |  2 +-
 8 files changed, 175 insertions(+), 98 deletions(-)

diff --git a/Jamroot b/Jamroot
index 0e06729..ccccfe3 100644
--- a/Jamroot
+++ b/Jamroot
@@ -24,7 +24,7 @@ rule get-boost-lib-path ( properties * )
 }
 
 lib boost_program_options : : <name>boost_program_options <conditional>@get-boost-lib-path ;
-lib scif : : <name>scif ;
+# lib scif : : <name>scif ;
 
 project HAM
 	: source-location $(SRC)
@@ -36,7 +36,7 @@ project HAM
 #	<toolset>intel:<cflags>"-static-intel"
 	<inlining>on # off, on, full
 	#<optimization>speed # off, speed, space
-	<cxxflags>"-std=c++11"
+	<cxxflags>"-hstd=c++11"
 	<threading>multi
 #	<link>static
 	: default-build debug release debug_mic release_mic
@@ -56,25 +56,25 @@ obj offload_obj_mpi : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_C
 
 constant OBJ_FILES_MPI : communicator_obj_mpi runtime_obj_mpi offload_obj_mpi communicator_mpi_obj_mpi ;
 
-obj communicator_obj_mpi_rma_dyn : ham/net/communicator.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
-obj communicator_mpi_rma_dyn_obj_mpi_rma_dyn : ham/net/communicator_mpi_rma_dynamic.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
-obj runtime_obj_mpi_rma_dyn : ham/offload/runtime.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
-obj offload_obj_mpi_rma_dyn : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj communicator_obj_mpi_rma_dyn : ham/net/communicator.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj communicator_mpi_rma_dyn_obj_mpi_rma_dyn : ham/net/communicator_mpi_rma_dynamic.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj runtime_obj_mpi_rma_dyn : ham/offload/runtime.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj offload_obj_mpi_rma_dyn : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
 
 constant OBJ_FILES_MPI_RMA_DYN : communicator_obj_mpi_rma_dyn communicator_mpi_rma_dyn_obj_mpi_rma_dyn runtime_obj_mpi_rma_dyn offload_obj_mpi_rma_dyn ;
 
-obj communicator_obj_scif : ham/net/communicator.cpp : <library>scif <define>HAM_COMM_SCIF ;
-obj communicator_scif_obj_scif : ham/net/communicator_scif.cpp : <library>scif <define>HAM_COMM_SCIF ;
-obj runtime_obj_scif : ham/offload/runtime.cpp : <library>scif <define>HAM_COMM_SCIF ;
-obj offload_obj_scif : ham/offload/offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
+# obj communicator_obj_scif : ham/net/communicator.cpp : <library>scif <define>HAM_COMM_SCIF ;
+# obj communicator_scif_obj_scif : ham/net/communicator_scif.cpp : <library>scif <define>HAM_COMM_SCIF ;
+# obj runtime_obj_scif : ham/offload/runtime.cpp : <library>scif <define>HAM_COMM_SCIF ;
+# obj offload_obj_scif : ham/offload/offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
-constant OBJ_FILES_SCIF : communicator_obj_scif runtime_obj_scif offload_obj_scif communicator_scif_obj_scif ;
+# constant OBJ_FILES_SCIF : communicator_obj_scif runtime_obj_scif offload_obj_scif communicator_scif_obj_scif ;
 
 # Libraries
 
 obj main_obj_mpi : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
-obj main_obj_mpi_rma_dyn : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
-obj main_obj_scif : ham/offload/main.cpp : <library>scif <define>HAM_COMM_SCIF ;
+obj main_obj_mpi_rma_dyn : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+# obj main_obj_scif : ham/offload/main.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi
 	: $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI) main_obj_mpi boost_program_options
@@ -83,17 +83,17 @@ lib ham_offload_mpi
 
 lib ham_offload_mpi_rma_dyn
     : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_obj_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
     ;
 
-lib ham_offload_scif
-	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_obj_scif boost_program_options
-	: <library>scif <define>HAM_COMM_SCIF
-	;
+# lib ham_offload_scif
+#	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_obj_scif boost_program_options
+#	: <library>scif <define>HAM_COMM_SCIF
+#	;
 
 obj main_explicit_obj_mpi : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
-obj main_explicit_obj_mpi_rma_dyn : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
-obj main_explicit_obj_scif : ham/offload/main_explicit.cpp : <library>scif <define>HAM_COMM_SCIF ;
+obj main_explicit_obj_mpi_rma_dyn : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+# obj main_explicit_obj_scif : ham/offload/main_explicit.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi_explicit
 	: $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI) main_explicit_obj_mpi boost_program_options
@@ -102,12 +102,13 @@ lib ham_offload_mpi_explicit
 
 lib ham_offload_mpi_rma_dyn_explicit
     : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_explicit_obj_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_EXPLICIT
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_EXPLICIT <define>HAM_DEBUG_ON
+    ;
 
-lib ham_offload_scif_explicit
-	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_explicit_obj_scif boost_program_options
-	: <library>scif <define>HAM_COMM_SCIF <define>HAM_EXPLICIT
-	;
+# lib ham_offload_scif_explicit
+#	: $(OBJ_FILES_COMMON) $(OBJ_FILES_SCIF) main_explicit_obj_scif boost_program_options
+#	: <library>scif <define>HAM_COMM_SCIF <define>HAM_EXPLICIT
+#	;
 
 # Benchmarks
 
@@ -117,17 +118,17 @@ exe benchmark_ham_offload_mpi
 	: <library>/mpi//mpi <library>ham_offload_mpi
 	;	
 
-obj benchmark_ham_offload_mpi_rma_dyn_obj : benchmark_ham_offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj benchmark_ham_offload_mpi_rma_dyn_obj : benchmark_ham_offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
 exe benchmark_ham_offload_mpi_rma_dyn
     : benchmark_ham_offload_mpi_rma_dyn_obj boost_program_options
-    : <library>/mpi//mpi <library>ham_offload_mpi_rma_dyn
+    : <library>/mpi//mpi <library>ham_offload_mpi_rma_dyn <define>HAM_DEBUG_ON
     ;
 
-obj benchmark_ham_offload_scif_obj : benchmark_ham_offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
-exe benchmark_ham_offload_scif
-	: benchmark_ham_offload_scif_obj boost_program_options ham_offload_scif
-	: <library>scif 
-	;	
+# obj benchmark_ham_offload_scif_obj : benchmark_ham_offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
+# exe benchmark_ham_offload_scif
+#	: benchmark_ham_offload_scif_obj boost_program_options ham_offload_scif
+#	: <library>scif
+#	;
 
 exe benchmark_intel_leo
 	: benchmark_intel_leo.cpp boost_program_options
@@ -154,14 +155,14 @@ exe ham_offload
 exe ham_offload_explicit
 	: ham_offload_explicit.cpp ham_offload_mpi_rma_dyn_explicit boost_program_options
 #	: <library>/mpi//mpi <define>HAM_COMM_MPI
-	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
 #	: <library>scif <define>HAM_COMM_SCIF
 	;
 
-exe inner_product_scif
-	: [ obj inner_product_obj : inner_product.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
-	: <library>scif <define>HAM_COMM_SCIF
-	;
+# exe inner_product_scif
+#	: [ obj inner_product_obj : inner_product.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
+#	: <library>scif <define>HAM_COMM_SCIF
+#	;
 
 exe inner_product_mpi
 	: [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ] ham_offload_mpi boost_program_options
@@ -170,13 +171,13 @@ exe inner_product_mpi
 
 exe inner_product_mpi_rma_dynamic
     : [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ] ham_offload_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
     ;
 
-exe test_data_transfer_scif
-	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
-	: <library>scif <define>HAM_COMM_SCIF
-	;
+# exe test_data_transfer_scif
+#	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
+#	: <library>scif <define>HAM_COMM_SCIF
+#	;
 
 exe test_data_transfer_mpi
 	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ] ham_offload_mpi boost_program_options
@@ -185,13 +186,13 @@ exe test_data_transfer_mpi
 
 exe test_data_transfer_mpi_rma_dynamic
 	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
 	;
 
-exe test_argument_transfer_scif
-	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
-	: <library>scif <define>HAM_COMM_SCIF
-	;
+# exe test_argument_transfer_scif
+# 	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>scif <define>HAM_COMM_SCIF ] ham_offload_scif boost_program_options
+#	: <library>scif <define>HAM_COMM_SCIF
+#	;
 
 exe test_argument_transfer_mpi
 	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ] ham_offload_mpi boost_program_options
@@ -200,9 +201,9 @@ exe test_argument_transfer_mpi
 
 exe test_argument_transfer_mpi_rma_dynamic
 	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
 	;
 
 # Explicit targets (not built by default)
 explicit benchmark_intel_leo ;
-explicit test_data_transfer_scif ;
+# explicit test_data_transfer_scif ;
diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index 4e84e2b..c754f99 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -50,7 +50,7 @@ namespace net {
 #define HAM_COMM_ONE_SIDED
 #include "ham/net/communicator_scif.hpp"
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
-#include "ham/net/communicator_scif.hpp"
+#include "ham/net/communicator_mpi_rma_dynamic.hpp"
 #else
 static_assert(false, "Please define either HAM_COMM_MPI, HAM_COMM_MPI_RMA_DYNAMIC or HAM_COMM_SCIF.");
 #endif
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index ebe7a10..3e28440 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -70,7 +70,7 @@ class communicator {
 		request() : valid_(false) {} // instantiate invalid
 		
 		request(node_t target_node, node_t source_node, size_t send_buffer_index, size_t recv_buffer_index)
-		 : target_node(target_node), source_node(source_node), valid_(true), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0)
+		 : target_node(target_node), source_node(source_node), valid_(true), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0), uses_rma_(false)
 		{}
 
 		// return true if request was finished
@@ -80,7 +80,7 @@ class communicator {
 			int flag = 0;
 			MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // just test the receive request, since the send belonging to the request triggers the remote send that is received
 
-            if(uses_rma)
+            if(uses_rma_)
             {
                 HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
             }
@@ -93,9 +93,9 @@ class communicator {
 			HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
 			MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
 			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
-            if(uses_rma)
+            if(uses_rma_)
             {
-                MPI_Win_unlock(target_node, rma_win);
+                MPI_Win_unlock(target_node, communicator::instance().rma_win);
             }
 			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
 		}
@@ -130,7 +130,7 @@ class communicator {
 		node_t target_node;
 		node_t source_node;
 		bool valid_;
-        bool uses_rma;
+        bool uses_rma_;
 
 		// only needed by the sender
 		enum { NUM_REQUESTS = 3 };
@@ -198,7 +198,7 @@ class communicator {
 		if (is_host()) {
 			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
 				// allocate buffers
-				peers[i].msg_buffers = allocate_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+				peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
 				// fill resource pools
 				for(size_t j = constants::MSG_BUFFERS; j > 0; --j) {
 					peers[i].buffer_pool.add(j-1);
@@ -268,7 +268,7 @@ class communicator {
 	{
 		//MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
         MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
-        MPI_Put(local_source, size, MPI_BYTE, remote_dest.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win);
+        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win);
         MPI_Win_unlock(remote_dest.node(), rma_win);
 	}
 
@@ -277,10 +277,10 @@ class communicator {
 	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		//MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
-        req.uses_rma = true;
+        req.uses_rma_ = true;
 
         MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
-        MPI_Rput(local_source, size, MPI_BYTE, remote_dest.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win, &re.next_mpi_request());
+        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win, &req.next_mpi_request());
 	}
 
 
@@ -288,8 +288,9 @@ class communicator {
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
 		//MPI_Recv((void*)local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-        MPI_Get(remote_source, size, MPI_BYTE, remote_source.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win);
-        MPI_Win_flush(remote_source.node(), rma_win);
+		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win);
+		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win);
+		MPI_Win_unlock(remote_source.node(), rma_win);
 	}
 	
 	// to be used by the host
@@ -297,8 +298,9 @@ class communicator {
 	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
 		//MPI_Irecv(static_cast<void*>(local_dest), size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
-        req.uses_rma = true;
-        MPI_RGet(remote_source, size, MPI_BYTE, remote_source.node(), (void *) remote_dest.get_mpi_address(), size, MPI_BYTE, rma_win, &req.next_mpi_request());
+        req.uses_rma_ = true;
+		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win);
+		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win, &req.next_mpi_request());
 	}
 
 	template<typename T>
@@ -307,12 +309,25 @@ class communicator {
 		T* ptr;
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
-        MPI_Aint mpi_address;
-        MPI_Win_attach(rma_win, (void *) &mpi_address, n * sizeof(T));
+		MPI_Win_attach(rma_win, (void*)ptr, n * sizeof(T));
+		MPI_Aint mpi_address;
+		MPI_Get_address((void*)ptr, &mpi_address);
 		// NOTE: no ctor is called
 		return buffer_ptr<T>(ptr, this_node_, mpi_address);
 	}
 
+	// for host to allocate peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
+	template<typename T>
+	buffer_ptr<T> allocate_peer_buffer(const size_t n, node_t source_node)
+	{
+		T* ptr;
+		//int err =
+		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+		// NOTE: no ctor is called
+		return buffer_ptr<T>(ptr, this_node_);
+	}
+
+	// for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
 	template<typename T>
 	void free_buffer(buffer_ptr<T> ptr)
 	{
@@ -322,6 +337,14 @@ class communicator {
 		free(static_cast<void*>(ptr.get()));
 	}
 
+	template<typename T>
+	void free_peer_buffer(buffer_ptr<T> ptr)
+	{
+		assert(ptr.node() == this_node_);
+		// NOTE: no dtor is called
+		free(static_cast<void*>(ptr.get()));
+	}
+
 	static communicator& instance() { return *instance_; }
 	static node_t this_node() { return instance().this_node_; }
 	static size_t num_nodes() { return instance().nodes_; }
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index f721597..b44451e 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -273,9 +273,10 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 	return result;
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
-	future<void> result(comm.allocate_request(remote_dest.node()));
+	future<void> result(comm.allocate_request(remote_source.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA get..." << std::endl; )
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
+    return result;
 #endif
 }
 
@@ -308,8 +309,28 @@ void get_sync(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 //}
 
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC // compile-integration pending
+        template<typename T>
+future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
+{
+    net::communicator& comm = runtime::instance().communicator();
+
+    // make sure there is no winlock on dest from host
+    // solution: shared lock, unlocking from host not necessary
+
+    // issues a put on the source node targeting the destination node
+    future<void> result(comm.allocate_request(source.node()));
+    HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
+    auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
+    comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof copy_msg);
+    comm.recv_result(result.get_request());
+
+    return result;
+}
+#endif
+
 #ifndef HAM_COMM_ONE_SIDED // TODO(feature, high priority): implement
-template<typename T>
+        template<typename T>
 void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
 	net::communicator& comm = runtime::instance().communicator();
@@ -341,27 +362,8 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
     copy(source, dest, n).get();
 #endif
 }
-#endif
-
-
-#ifdef HAM_COMM_MPI_RMA_DYNAMIC // compile-integration pending
-template<typename T>
-future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
-{
-    net::communicator& comm = runtime::instance().communicator();
 
-    // make sure there is no winlock on dest from host
-    // solution: shared lock, unlocking from host not necessary
-
-    // issues a put on the source node targeting the destination node
-    future<void> result(comm.allocate_request(source.node()));
-    HAM_DEBUG( HAM_LOG << "offload::copy_sync(): initiating copy between " << source.node() << " and " << dest.node() << std::endl; )
-    auto copy_msg = detail::offload_rma_copy_msg<T>(result.get_request(), dest.node(), dest.get_mpi_address(), source.get(), n);
-    comm.send_msg(result.get_request(), (void*)&copy_msg, sizeof write_msg);
-    comm.recv_result(result.get_request());
 
-    return result;
-}
 #endif
 
 // TODO(feature): new API elements
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index b16a8a0..90e0fee 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -147,7 +147,7 @@ class offload_read_msg
 
         void operator()() //const
         {
-            communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
+            communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n);
 
             // send a result message to tell the sender, that the transfer is done
             if (req.valid()) {
diff --git a/src/inner_product.cpp b/src/inner_product.cpp
index 3dc1c60..a988abc 100644
--- a/src/inner_product.cpp
+++ b/src/inner_product.cpp
@@ -17,6 +17,15 @@ double inner_product(offload::buffer_ptr<double> x, offload::buffer_ptr<double>
 	return z;
 }
 
+bool print_buffer_content(offload::buffer_ptr<double> x, size_t n)
+{
+	std::cout << "printing data on node " << x.node() << std::endl;
+	for (size_t i = 0; i < n; ++i)
+		std::cout << x[i] << " ";
+	std::cout << std::endl;
+	return true;
+}
+
 int main(int argc, char* argv[])
 {
 	// buffer size
@@ -40,20 +49,26 @@ int main(int argc, char* argv[])
 
 	// allocate device memory (returns a buffer_ptr<T>)
 	auto a_target = offload::allocate<double>(target, n);
-	auto b_target = offload::allocate<double>(target, n);
-	
+	std::cout << "allocated remote buffer 1" << std::endl;
+	//auto b_target = offload::allocate<double>(target, n);
+	//std::cout << "allocated remote buffer 2" << std::endl;
+
+
 	// transfer data to the device (the target is implicitly specified by the destination buffer_ptr)	
-	auto future_a_put = offload::put(a.data(), a_target, n); // async
-	offload::put(b.data(), b_target, n); // sync (implicitly returned future performs synchronisation in dtor), alternative: put_sync()
+	//auto future_a_put = offload::put(a.data(), a_target, n); // async
+	offload::put(a.data(), a_target, n); // sync
+	//offload::put(b.data(), b_target, n); // sync (implicitly returned future performs synchronisation in dtor), alternative: put_sync()
 	
 	// synchronise
-	future_a_put.get();
-	
+	//future_a_put.get();
+
+	std::cout << "completed put" << std::endl;
+
 	// asynchronously offload the call to inner_product
-	auto c_future = offload::async(target, f2f(&inner_product, a_target, b_target, n));
+	//auto c_future = offload::async(target, f2f(&inner_product, a_target, b_target, n));
 
 	// synchronise on the result
-	double c = c_future.get(); 
+	//double c = c_future.get();
 
 	// we also could have used:
 	// double c = offload::async(...).get();
@@ -62,7 +77,7 @@ int main(int argc, char* argv[])
 	// offload.async(...);
 
 	// output the result
-	std::cout << "Result: " << c << std::endl;
+	//std::cout << "Result: " << c << std::endl;
 	
 	return 0;	
 }
diff --git a/src/test_data_transfer.cpp b/src/test_data_transfer.cpp
index d53eb3e..a58569c 100644
--- a/src/test_data_transfer.cpp
+++ b/src/test_data_transfer.cpp
@@ -27,6 +27,15 @@ bool compare(const std::vector<T>& a, const std::vector<T>& b)
 	return std::equal(a.begin(), a.end(), b.begin());
 }
 
+double print_buffer_content(offload::buffer_ptr<double> x, size_t n)
+{
+	std::cout << "printing data on node " << x.node() << std::endl;
+	for (size_t i = 0; i < n; ++i)
+		std::cout << x[i] << " ";
+	std::cout << std::endl;
+	return 50.0;
+}
+
 int main(int argc, char* argv[])
 {
 	std::cout << "Testing data transfer: host -> target_a -> target_b -> host." << std::endl;
@@ -48,12 +57,39 @@ int main(int argc, char* argv[])
 	// allocate device memory (returns a buffer_ptr<T>)
 	auto target_buffer_a = offload::allocate<double>(target_a, n);
 	auto target_buffer_b = offload::allocate<double>(target_b, n);
+
+	offload::sync(target_a, f2f(&print_buffer_content, target_buffer_a, n));
+
+	std::cout << "a - get: " << target_buffer_a.get() << std::endl;
+	std::cout << "a - node: " << target_buffer_a.node() << std::endl;
+
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+	std::cout << "a - mpi: " << target_buffer_a.get_mpi_address() << std::endl;
+#endif
+
+	std::cout << "b - get: " << target_buffer_b.get() << std::endl;
+	std::cout << "b - node: " << target_buffer_b.node() << std::endl;
+
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+	std::cout << "b - mpi: " << target_buffer_b.get_mpi_address() << std::endl;
+#endif
 	
 	// host -> target_a -> target_b -> host
+	std::cout << "put to target_a: ";
 	offload::put(write_buffer.data(), target_buffer_a, n);
+	std::cout << "done" << std::endl;
+
+	offload::sync(target_a, f2f(&print_buffer_content, target_buffer_a, n));
+
+	std::cout << "copy from target_a to target_b: ";
 	offload::copy_sync(target_buffer_a, target_buffer_b, n);
+	std::cout << "done" << std::endl;
+
+	offload::async(target_b, f2f(&print_buffer_content, target_buffer_b, n));
+
+	std::cout << "get from target_b: ";
 	offload::get(target_buffer_b, read_buffer.data(), n);
-	
+	std::cout << "done" << std::endl;
 	// verify
 	bool passed = compare(write_buffer, read_buffer);
 	
diff --git a/tools/install_boost.sh b/tools/install_boost.sh
index 4f30ddf..9b91667 100755
--- a/tools/install_boost.sh
+++ b/tools/install_boost.sh
@@ -35,7 +35,7 @@
 
 DOWNLOAD_PATH=$HOME/boost/
 INSTALL_PATH=$HOME/software
-NO_MIC=false # set to true, to disable building Boost for Xeon Phi
+NO_MIC=true # set to true, to disable building Boost for Xeon Phi
 BASHRC_FILE=$HOME/.bashrc # set to /dev/null to disable, or to any other file to manually merge the needed changes into your .bashrc 
 
 BOOST_BUILD_OPTIONS="-j8" # concurrent build with up to 8 commands

From a29c877796276cd7a186714631eaddcd81ef61de Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 5 Apr 2018 12:45:23 +0200
Subject: [PATCH 018/150] segfault fixes

---
 Jamroot                    |  4 ++--
 src/inner_product.cpp      | 21 ++++++++++-----------
 src/test_data_transfer.cpp | 13 +++++++------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/Jamroot b/Jamroot
index ccccfe3..4fb6664 100644
--- a/Jamroot
+++ b/Jamroot
@@ -170,8 +170,8 @@ exe inner_product_mpi
 	;
 
 exe inner_product_mpi_rma_dynamic
-    : [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ] ham_offload_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
+    : [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
+    : <library>/mpi//mpi <cflags>-g <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
     ;
 
 # exe test_data_transfer_scif
diff --git a/src/inner_product.cpp b/src/inner_product.cpp
index a988abc..f959eee 100644
--- a/src/inner_product.cpp
+++ b/src/inner_product.cpp
@@ -50,25 +50,24 @@ int main(int argc, char* argv[])
 	// allocate device memory (returns a buffer_ptr<T>)
 	auto a_target = offload::allocate<double>(target, n);
 	std::cout << "allocated remote buffer 1" << std::endl;
-	//auto b_target = offload::allocate<double>(target, n);
-	//std::cout << "allocated remote buffer 2" << std::endl;
+	auto b_target = offload::allocate<double>(target, n);
+	std::cout << "allocated remote buffer 2" << std::endl;
 
 
 	// transfer data to the device (the target is implicitly specified by the destination buffer_ptr)	
-	//auto future_a_put = offload::put(a.data(), a_target, n); // async
-	offload::put(a.data(), a_target, n); // sync
-	//offload::put(b.data(), b_target, n); // sync (implicitly returned future performs synchronisation in dtor), alternative: put_sync()
-	
-	// synchronise
-	//future_a_put.get();
+	auto future_a_put = offload::put(a.data(), a_target, n); // async
+    offload::put(b.data(), b_target, n); // sync (implicitly returned future performs synchronisation in dtor), alternative: put_sync()
+
+    // synchronise
+    future_a_put.get();
 
 	std::cout << "completed put" << std::endl;
 
 	// asynchronously offload the call to inner_product
-	//auto c_future = offload::async(target, f2f(&inner_product, a_target, b_target, n));
+	auto c_future = offload::async(target, f2f(&inner_product, a_target, b_target, n));
 
 	// synchronise on the result
-	//double c = c_future.get();
+	double c = c_future.get();
 
 	// we also could have used:
 	// double c = offload::async(...).get();
@@ -77,7 +76,7 @@ int main(int argc, char* argv[])
 	// offload.async(...);
 
 	// output the result
-	//std::cout << "Result: " << c << std::endl;
+	std::cout << "Result: " << c << std::endl;
 	
 	return 0;	
 }
diff --git a/src/test_data_transfer.cpp b/src/test_data_transfer.cpp
index a58569c..cb8c60d 100644
--- a/src/test_data_transfer.cpp
+++ b/src/test_data_transfer.cpp
@@ -76,15 +76,16 @@ int main(int argc, char* argv[])
 	
 	// host -> target_a -> target_b -> host
 	std::cout << "put to target_a: ";
-	offload::put(write_buffer.data(), target_buffer_a, n);
-	std::cout << "done" << std::endl;
+	auto put_future = offload::put(write_buffer.data(), target_buffer_a, n);
+    put_future.get();
+    std::cout << "done" << std::endl;
 
-	offload::sync(target_a, f2f(&print_buffer_content, target_buffer_a, n));
 
-	std::cout << "copy from target_a to target_b: ";
-	offload::copy_sync(target_buffer_a, target_buffer_b, n);
-	std::cout << "done" << std::endl;
+    offload::sync(target_a, f2f(&print_buffer_content, target_buffer_a, n));
 
+    std::cout << "copy from target_a to target_b: ";
+    offload::copy_sync(target_buffer_a, target_buffer_b, n);
+    std::cout << "done" << std::endl;
 	offload::async(target_b, f2f(&print_buffer_content, target_buffer_b, n));
 
 	std::cout << "get from target_b: ";

From 2a3dc90ce91d1ee35e22c5a31a05df8390ea25f3 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 5 Apr 2018 18:38:28 +0200
Subject: [PATCH 019/150] started implementing pairwise dynamic windows

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 117 ++++++++++++++----
 include/ham/offload/offload_msg.hpp           |  33 ++++-
 src/inner_product.cpp                         |   7 +-
 3 files changed, 130 insertions(+), 27 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 3e28440..1ef1320 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -165,7 +165,6 @@ class communicator {
 		MPI_Comm_size(MPI_COMM_WORLD, &t);
 		nodes_ = t;
 		host_node_ = 0; // TODO(improvement): make configureable, like for SCIF
-        MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &rma_win);
 
 		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI done" << std::endl; )
 
@@ -194,8 +193,12 @@ class communicator {
 		//MPI_Alltoall(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
 		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
-		
+
+		// prepare global group to create pairwise groups
+		MPI_Comm_group(MPI_COMM_WORLD, &global_group);
+
 		if (is_host()) {
+
 			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
 				// allocate buffers
 				peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
@@ -203,7 +206,30 @@ class communicator {
 				for(size_t j = constants::MSG_BUFFERS; j > 0; --j) {
 					peers[i].buffer_pool.add(j-1);
 				}
+
+				// init comm to target from pairwise subgroups
+				const int members[2] = {host_node_, i}; // NOTE: this implies new group rank is 0 for host, 1 for target
+				MPI_Group pairwise_group;
+				MPI_Group_incl(global_group, 2, members, &pairwise_group);
+				MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[i].rma_comm));
+				MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
+
+				// init win to target
+				MPI_Win_create_dynamic(MPI_INFO_NULL, peers[i].rma_comm, &(peers[i].rma_win));
 			}
+
+
+		} else {
+			// init comm to host from pairwise subgroup
+			const int members[2] = {host_node_, this_node_}; // NOTE: this implies new group rank = 0 for host, 1 for target
+			MPI_Group pairwise_group;
+			MPI_Group_incl(global_group, 2, members, &pairwise_group); // should match the corresponding subgroup on host for i = this_node_
+			MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[host_node_].rma_comm));
+			MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
+
+			// init win to host
+			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[host_node_].rma_comm, &(peers[host_node_].rma_win));
+
 		}
 	}
 
@@ -250,7 +276,7 @@ class communicator {
 	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE)
 	{
 		static msg_buffer buffer; // NOTE !
-		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		MPI_Recv(&buffer, size, MPI_BYTE, MPI_ANY_SOURCE, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // changed source from host_node_ to MPI_ANY_SOURCE so targets may react to request for setting up rma paths
 		return static_cast<void*>(&buffer);
 	}
 
@@ -263,44 +289,57 @@ class communicator {
 		return;
 	}
 
+	// in MPI RMA backend only used by copy
+	// host uses async version
+	// targets don't send data to host as host uses rma get
 	template<typename T>
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
-		//MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
-        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
-        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win);
-        MPI_Win_unlock(remote_dest.node(), rma_win);
+		// resolve rank for subgroup
+		int target_rank;
+		if(remote_dest.node() > this_node_) {
+			target_rank = 1;
+		} else {
+			target_rank = 0;
+		}
+		// execute transfer
+		MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, peers[remote_dest.node()].rma_win);
+        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, target_rank, remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
+        MPI_Win_unlock(target_rank, peers[remote_dest.node()].rma_win);
 	}
 
-	// to be used by the host
+	// to be used by the host only
 	template<typename T>
 	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
-		//MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
         req.uses_rma_ = true;
 
-        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, rma_win);
-        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win, &req.next_mpi_request());
+		// resolving rank for subgroup not necessary, is always 1 for the target
+        MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, peers[remote_dest.node()].rma_win);
+        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, 1, remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
 	}
 
-
+	// not used in MPI RMA backend
+	// host uses async version
+	// targets don't use get
+	// should be safe to remove
 	template<typename T>
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-		//MPI_Recv((void*)local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win);
-		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win);
-		MPI_Win_unlock(remote_source.node(), rma_win);
+		MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[remote_source.node()].rma_win); // dummy rank number as if targets were to use recv_data from host
+		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win);
+		MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_win);
 	}
 	
 	// to be used by the host
 	template<typename T>
 	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-		//MPI_Irecv(static_cast<void*>(local_dest), size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
         req.uses_rma_ = true;
-		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, rma_win);
-		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, rma_win, &req.next_mpi_request());
+
+		// resolving rank for subgroup not necessary, is always 1 for the target
+		MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, peers[remote_source.node()].rma_win);
+		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, 1, remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
 	}
 
 	template<typename T>
@@ -309,7 +348,7 @@ class communicator {
 		T* ptr;
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
-		MPI_Win_attach(rma_win, (void*)ptr, n * sizeof(T));
+		MPI_Win_attach(peers[source_node].rma_win, (void*)ptr, n * sizeof(T)); // only attach to the window corresponding to the requesting node, is attached to potential target-target-windows on demand
 		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
 		// NOTE: no ctor is called
@@ -333,6 +372,8 @@ class communicator {
 	{
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
+
+		// remove from all potential rma windows
         MPI_Win_detach(rma_win, ptr.get());
 		free(static_cast<void*>(ptr.get()));
 	}
@@ -356,20 +397,52 @@ class communicator {
 		return instance().node_descriptions[node];
 	}
 
+	// called to check if an rma path between two targets exists, sufficient to call on one of the two targets
+	bool has_rma_path(node_t target_node) {
+		// check if copy path exists
+		return !peers[remote_dest.node()].rma_win;
+	}
+
+	// called to establish an rma path between two targets for copy operations, needs to be called on both sides
+	void establish_rma_path(node_t target_node) {
+		if(!has_rma_path(target_node)) { // make sure there is not already an rma path
+			const int members[2];
+			// NOTE: protocol for target-target sub-ranks is: lower global rank: 0, higher global rank: 1
+			// thus rank for existing copy paths can be easily translated by comparing target rank to own rank
+			if(this_node_ > target_node) {
+				members[0] = target_node;
+				members[1] = this_node_;
+			} else {
+				members[0] = this_node_;
+				members[1] = target_node;
+			}
+			MPI_Group pairwise_group;
+			MPI_Group_incl(global_group, 2, members, &pairwise_group);
+			MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[target_node].rma_comm));
+			MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
+			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[target_node].rma_comm, &(peers[target_node].rma_win));
+		}
+	}
+
+
 private:
 	static communicator* instance_;
 	node_t this_node_;
 	size_t nodes_;
 	node_t host_node_;
 	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
-    MPI_Win rma_win; // globally shared dynamic window for rma ops
-		
+	MPI_Group global_group;
+
 	struct mpi_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
 
 		// needed by sender to manage which buffers are in use and which are free
 		// just manages indices, that can be used by
 		detail::resource_pool<size_t> buffer_pool;
+
+		// mpi rma dynamic window
+		MPI_Win rma_win;
+		MPI_Comm rma_comm;
 	};
 	
 	mpi_peer* peers;
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index 90e0fee..e9f75b0 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -134,9 +134,7 @@ class offload_read_msg
 	size_t n;
 };
 
-
-// TODO(daniel, high priority): implement offload_copy_msg, copy with one-sided rma needs a msg containing ptrs for source+target
-//#ifdef SOME_COOL_VAR_FOR_MPI_RMA_DYN // compile-integration pending
+//#ifdef HAM_COMM_MPI_RMA_DYNAMIC
     template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
     class offload_rma_copy_msg
             : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -147,6 +145,9 @@ class offload_read_msg
 
         void operator()() //const
         {
+            communicator::instance().establish_rma_path(remote_node); // should quickly return if path already exists
+            // attach existing buffers to new target window ?!?
+
             communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n);
 
             // send a result message to tell the sender, that the transfer is done
@@ -164,6 +165,32 @@ class offload_read_msg
     };
 //#endif
 
+// allows user to setup an rma link between two targets without a copy transfer
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+    template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
+    class setup_rma_path_msg
+            : public active_msg<setup_rma_path_msg<T, ExecutionPolicy>, ExecutionPolicy>
+    {
+    public:
+        setup_rma_path_msg(node_t remote_node)
+                : remote_node(remote_node) { }
+
+        void operator()() //const
+        {
+            communicator::instance().establish_rma_path(remote_node);
+
+            // send a result message to tell the sender that the path is set up
+            if (req.valid()) {
+                req.send_result((void*)&remote_node, sizeof remote_node);
+            }
+        }
+    private:
+        node_t remote_node;
+    };
+#endif
+
+// link buffer msg? to tell target of copy to add the buffer to the soecific window... which might not even exist...fuck
+
 } // namespace detail
 } // namespace offload
 } // namespace ham
diff --git a/src/inner_product.cpp b/src/inner_product.cpp
index f959eee..87b04db 100644
--- a/src/inner_product.cpp
+++ b/src/inner_product.cpp
@@ -56,10 +56,10 @@ int main(int argc, char* argv[])
 
 	// transfer data to the device (the target is implicitly specified by the destination buffer_ptr)	
 	auto future_a_put = offload::put(a.data(), a_target, n); // async
+    future_a_put.get();
     offload::put(b.data(), b_target, n); // sync (implicitly returned future performs synchronisation in dtor), alternative: put_sync()
 
     // synchronise
-    future_a_put.get();
 
 	std::cout << "completed put" << std::endl;
 
@@ -77,7 +77,10 @@ int main(int argc, char* argv[])
 
 	// output the result
 	std::cout << "Result: " << c << std::endl;
-	
+
+    MPI_Win_create_dynamic()
+
+
 	return 0;	
 }
 

From c2b18ae0a68f29258eadcc791effb3c6baa568d2 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 6 Apr 2018 15:47:57 +0200
Subject: [PATCH 020/150] changed to pairwise global dynamic windows

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 125 +++++++++---------
 include/ham/offload/offload.hpp               |   2 +-
 include/ham/offload/offload_msg.hpp           |   8 +-
 src/inner_product.cpp                         |   3 +-
 4 files changed, 70 insertions(+), 68 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 1ef1320..1ec104a 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -95,7 +95,7 @@ class communicator {
 			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
             if(uses_rma_)
             {
-                MPI_Win_unlock(target_node, communicator::instance().rma_win);
+                MPI_Win_unlock(target_node, communicator::instance().peers[target_node].rma_win);
             }
 			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
 		}
@@ -194,43 +194,51 @@ class communicator {
 		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
 
-		// prepare global group to create pairwise groups
-		MPI_Comm_group(MPI_COMM_WORLD, &global_group);
 
-		if (is_host()) {
-
-			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-				// allocate buffers
-				peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
-				// fill resource pools
-				for(size_t j = constants::MSG_BUFFERS; j > 0; --j) {
-					peers[i].buffer_pool.add(j-1);
-				}
-
-				// init comm to target from pairwise subgroups
-				const int members[2] = {host_node_, i}; // NOTE: this implies new group rank is 0 for host, 1 for target
-				MPI_Group pairwise_group;
-				MPI_Group_incl(global_group, 2, members, &pairwise_group);
-				MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[i].rma_comm));
-				MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
-
-				// init win to target
-				MPI_Win_create_dynamic(MPI_INFO_NULL, peers[i].rma_comm, &(peers[i].rma_win));
-			}
+        if (is_host()) {
 
+            for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+                // allocate buffers
+                peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+                // fill resource pools
+                for (size_t j = constants::MSG_BUFFERS; j > 0; --j) {
+                    peers[i].buffer_pool.add(j - 1);
+                }
+            }
+        }
 
-		} else {
-			// init comm to host from pairwise subgroup
-			const int members[2] = {host_node_, this_node_}; // NOTE: this implies new group rank = 0 for host, 1 for target
-			MPI_Group pairwise_group;
-			MPI_Group_incl(global_group, 2, members, &pairwise_group); // should match the corresponding subgroup on host for i = this_node_
-			MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[host_node_].rma_comm));
-			MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
+        // initialise 1 global window per target
+        for (node_t i = 1; i < nodes_; ++i) {
+            MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_win));
+        }
 
-			// init win to host
-			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[host_node_].rma_comm, &(peers[host_node_].rma_win));
+        HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
+/* pairwise COMM stuff
+       // both
+                // prepare global group to create pairwise groups
+                MPI_Comm_group(MPI_COMM_WORLD, &global_group);
+       // host
+ 				// init comm to target from pairwise subgroups
+ 				const int members[2] = {host_node_, i}; // NOTE: this implies new group rank is 0 for host, 1 for target
+ 				MPI_Group pairwise_group;
+ 				MPI_Group_incl(global_group, 2, members, &pairwise_group);
+ 				MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[i].rma_comm));
+ 				MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
+
+ 				// init win to target
+ 				MPI_Win_create_dynamic(MPI_INFO_NULL, peers[i].rma_comm, &(peers[i].rma_win));
+       // targets
+ 			    // init comm to host from pairwise subgroup
+ 			    const int members[2] = {host_node_, this_node_}; // NOTE: this implies new group rank = 0 for host, 1 for target
+ 			    MPI_Group pairwise_group;
+ 			    MPI_Group_incl(global_group, 2, members, &pairwise_group); // should match the corresponding subgroup on host for i = this_node_
+ 			    MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[host_node_].rma_comm));
+ 			    MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
+
+ 			    // init win to host
+ 			    MPI_Win_create_dynamic(MPI_INFO_NULL, peers[host_node_].rma_comm, &(peers[host_node_].rma_win));
+ */
 
-		}
 	}
 
 	~communicator()
@@ -276,8 +284,8 @@ class communicator {
 	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE)
 	{
 		static msg_buffer buffer; // NOTE !
-		MPI_Recv(&buffer, size, MPI_BYTE, MPI_ANY_SOURCE, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE); // changed source from host_node_ to MPI_ANY_SOURCE so targets may react to request for setting up rma paths
-		return static_cast<void*>(&buffer);
+		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        return static_cast<void*>(&buffer);
 	}
 
 	// trigger receiving the result of a message on the sending side
@@ -295,17 +303,10 @@ class communicator {
 	template<typename T>
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
-		// resolve rank for subgroup
-		int target_rank;
-		if(remote_dest.node() > this_node_) {
-			target_rank = 1;
-		} else {
-			target_rank = 0;
-		}
 		// execute transfer
-		MPI_Win_lock(MPI_LOCK_SHARED, target_rank, 0, peers[remote_dest.node()].rma_win);
-        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, target_rank, remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
-        MPI_Win_unlock(target_rank, peers[remote_dest.node()].rma_win);
+		MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
+        MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
 	}
 
 	// to be used by the host only
@@ -314,9 +315,8 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-		// resolving rank for subgroup not necessary, is always 1 for the target
-        MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, peers[remote_dest.node()].rma_win);
-        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, 1, remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
+        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
 	}
 
 	// not used in MPI RMA backend
@@ -326,7 +326,7 @@ class communicator {
 	template<typename T>
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-		MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[remote_source.node()].rma_win); // dummy rank number as if targets were to use recv_data from host
+		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
 		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win);
 		MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_win);
 	}
@@ -337,9 +337,8 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-		// resolving rank for subgroup not necessary, is always 1 for the target
-		MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, peers[remote_source.node()].rma_win);
-		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, 1, remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
+		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
+		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
 	}
 
 	template<typename T>
@@ -348,7 +347,10 @@ class communicator {
 		T* ptr;
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
-		MPI_Win_attach(peers[source_node].rma_win, (void*)ptr, n * sizeof(T)); // only attach to the window corresponding to the requesting node, is attached to potential target-target-windows on demand
+        // attach to all windows
+        for (node_t i = 1; i < nodes_; ++i) {
+            MPI_Win_attach(peers[i].rma_win, (void*)ptr, n * sizeof(T));
+        }
 		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
 		// NOTE: no ctor is called
@@ -366,18 +368,19 @@ class communicator {
 		return buffer_ptr<T>(ptr, this_node_);
 	}
 
-	// for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
 	template<typename T>
 	void free_buffer(buffer_ptr<T> ptr)
 	{
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
-
-		// remove from all potential rma windows
-        MPI_Win_detach(rma_win, ptr.get());
+		// remove from all rma windows
+        for (node_t i = 1; i < nodes_; ++i) {
+            MPI_Win_detach(peers[i].rma_win, ptr.get());
+        }
 		free(static_cast<void*>(ptr.get()));
 	}
 
+    // for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
 	template<typename T>
 	void free_peer_buffer(buffer_ptr<T> ptr)
 	{
@@ -397,12 +400,14 @@ class communicator {
 		return instance().node_descriptions[node];
 	}
 
+/*
 	// called to check if an rma path between two targets exists, sufficient to call on one of the two targets
 	bool has_rma_path(node_t target_node) {
 		// check if copy path exists
 		return !peers[remote_dest.node()].rma_win;
 	}
-
+*/
+/*
 	// called to establish an rma path between two targets for copy operations, needs to be called on both sides
 	void establish_rma_path(node_t target_node) {
 		if(!has_rma_path(target_node)) { // make sure there is not already an rma path
@@ -423,7 +428,7 @@ class communicator {
 			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[target_node].rma_comm, &(peers[target_node].rma_win));
 		}
 	}
-
+*/
 
 private:
 	static communicator* instance_;
@@ -431,7 +436,6 @@ class communicator {
 	size_t nodes_;
 	node_t host_node_;
 	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
-	MPI_Group global_group;
 
 	struct mpi_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
@@ -442,7 +446,6 @@ class communicator {
 
 		// mpi rma dynamic window
 		MPI_Win rma_win;
-		MPI_Comm rma_comm;
 	};
 	
 	mpi_peer* peers;
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index b44451e..1c2e78c 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -309,7 +309,7 @@ void get_sync(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 //}
 
-#ifdef HAM_COMM_MPI_RMA_DYNAMIC // compile-integration pending
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC
         template<typename T>
 future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index e9f75b0..cb8a5a8 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -145,9 +145,9 @@ class offload_read_msg
 
         void operator()() //const
         {
-            communicator::instance().establish_rma_path(remote_node); // should quickly return if path already exists
+        /*   communicator::instance().establish_rma_path(remote_node); // should quickly return if path already exists
             // attach existing buffers to new target window ?!?
-
+        */
             communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n);
 
             // send a result message to tell the sender, that the transfer is done
@@ -165,6 +165,7 @@ class offload_read_msg
     };
 //#endif
 
+/*
 // allows user to setup an rma link between two targets without a copy transfer
 #ifdef HAM_COMM_MPI_RMA_DYNAMIC
     template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
@@ -188,8 +189,7 @@ class offload_read_msg
         node_t remote_node;
     };
 #endif
-
-// link buffer msg? to tell target of copy to add the buffer to the soecific window... which might not even exist...fuck
+*/
 
 } // namespace detail
 } // namespace offload
diff --git a/src/inner_product.cpp b/src/inner_product.cpp
index 87b04db..7ad0f18 100644
--- a/src/inner_product.cpp
+++ b/src/inner_product.cpp
@@ -77,8 +77,7 @@ int main(int argc, char* argv[])
 
 	// output the result
 	std::cout << "Result: " << c << std::endl;
-
-    MPI_Win_create_dynamic()
+    
 
 
 	return 0;	

From b6d15c8e6ae52f694a2a987e5cbcf42149c74d37 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 6 Apr 2018 16:01:49 +0200
Subject: [PATCH 021/150] fixed Jamroot

---
 Jamroot | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/Jamroot b/Jamroot
index 4fb6664..6f6dbb2 100644
--- a/Jamroot
+++ b/Jamroot
@@ -56,10 +56,10 @@ obj offload_obj_mpi : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_C
 
 constant OBJ_FILES_MPI : communicator_obj_mpi runtime_obj_mpi offload_obj_mpi communicator_mpi_obj_mpi ;
 
-obj communicator_obj_mpi_rma_dyn : ham/net/communicator.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
-obj communicator_mpi_rma_dyn_obj_mpi_rma_dyn : ham/net/communicator_mpi_rma_dynamic.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
-obj runtime_obj_mpi_rma_dyn : ham/offload/runtime.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
-obj offload_obj_mpi_rma_dyn : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj communicator_obj_mpi_rma_dyn : ham/net/communicator.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj communicator_mpi_rma_dyn_obj_mpi_rma_dyn : ham/net/communicator_mpi_rma_dynamic.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj runtime_obj_mpi_rma_dyn : ham/offload/runtime.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
+obj offload_obj_mpi_rma_dyn : ham/offload/offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 
 constant OBJ_FILES_MPI_RMA_DYN : communicator_obj_mpi_rma_dyn communicator_mpi_rma_dyn_obj_mpi_rma_dyn runtime_obj_mpi_rma_dyn offload_obj_mpi_rma_dyn ;
 
@@ -73,7 +73,7 @@ constant OBJ_FILES_MPI_RMA_DYN : communicator_obj_mpi_rma_dyn communicator_mpi_r
 # Libraries
 
 obj main_obj_mpi : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
-obj main_obj_mpi_rma_dyn : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj main_obj_mpi_rma_dyn : ham/offload/main.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 # obj main_obj_scif : ham/offload/main.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi
@@ -83,7 +83,7 @@ lib ham_offload_mpi
 
 lib ham_offload_mpi_rma_dyn
     : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_obj_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
     ;
 
 # lib ham_offload_scif
@@ -92,7 +92,7 @@ lib ham_offload_mpi_rma_dyn
 #	;
 
 obj main_explicit_obj_mpi : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI ;
-obj main_explicit_obj_mpi_rma_dyn : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj main_explicit_obj_mpi_rma_dyn : ham/offload/main_explicit.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 # obj main_explicit_obj_scif : ham/offload/main_explicit.cpp : <library>scif <define>HAM_COMM_SCIF ;
 
 lib ham_offload_mpi_explicit
@@ -102,7 +102,7 @@ lib ham_offload_mpi_explicit
 
 lib ham_offload_mpi_rma_dyn_explicit
     : $(OBJ_FILES_COMMON) $(OBJ_FILES_MPI_RMA_DYN) main_explicit_obj_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_EXPLICIT <define>HAM_DEBUG_ON
+    : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_EXPLICIT
     ;
 
 # lib ham_offload_scif_explicit
@@ -118,10 +118,10 @@ exe benchmark_ham_offload_mpi
 	: <library>/mpi//mpi <library>ham_offload_mpi
 	;	
 
-obj benchmark_ham_offload_mpi_rma_dyn_obj : benchmark_ham_offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON ;
+obj benchmark_ham_offload_mpi_rma_dyn_obj : benchmark_ham_offload.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ;
 exe benchmark_ham_offload_mpi_rma_dyn
     : benchmark_ham_offload_mpi_rma_dyn_obj boost_program_options
-    : <library>/mpi//mpi <library>ham_offload_mpi_rma_dyn <define>HAM_DEBUG_ON
+    : <library>/mpi//mpi <library>ham_offload_mpi_rma_dyn
     ;
 
 # obj benchmark_ham_offload_scif_obj : benchmark_ham_offload.cpp : <library>scif <define>HAM_COMM_SCIF ;
@@ -155,7 +155,7 @@ exe ham_offload
 exe ham_offload_explicit
 	: ham_offload_explicit.cpp ham_offload_mpi_rma_dyn_explicit boost_program_options
 #	: <library>/mpi//mpi <define>HAM_COMM_MPI
-	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 #	: <library>scif <define>HAM_COMM_SCIF
 	;
 
@@ -171,7 +171,7 @@ exe inner_product_mpi
 
 exe inner_product_mpi_rma_dynamic
     : [ obj inner_product_obj : inner_product.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
-    : <library>/mpi//mpi <cflags>-g <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
+    : <library>/mpi//mpi <cflags>-g <define>HAM_COMM_MPI_RMA_DYNAMIC
     ;
 
 # exe test_data_transfer_scif
@@ -186,7 +186,7 @@ exe test_data_transfer_mpi
 
 exe test_data_transfer_mpi_rma_dynamic
 	: [ obj test_data_transfer_obj : test_data_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 	;
 
 # exe test_argument_transfer_scif
@@ -201,7 +201,7 @@ exe test_argument_transfer_mpi
 
 exe test_argument_transfer_mpi_rma_dynamic
 	: [ obj test_argument_transfer_obj : test_argument_transfer.cpp : <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC ] ham_offload_mpi_rma_dyn boost_program_options
-	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC <define>HAM_DEBUG_ON
+	: <library>/mpi//mpi <define>HAM_COMM_MPI_RMA_DYNAMIC
 	;
 
 # Explicit targets (not built by default)

From 5279080f8c8fd5fc5d0f0929917e295acf51f50e Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 6 Apr 2018 16:53:42 +0200
Subject: [PATCH 022/150] fixed error in unused function

---
 Jamroot                                          | 5 +++--
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/Jamroot b/Jamroot
index 6f6dbb2..f832e4e 100644
--- a/Jamroot
+++ b/Jamroot
@@ -32,14 +32,15 @@ project HAM
 	<include>$(INC)
 	<include>$(BOOST_PATH)/include
 	<variant>debug:<define>HAM_DEBUG_ON
-	<variant>debug_mic:<define>HAM_DEBUG_ON 
+#	<variant>debug_mic:<define>HAM_DEBUG_ON 
 #	<toolset>intel:<cflags>"-static-intel"
 	<inlining>on # off, on, full
 	#<optimization>speed # off, speed, space
 	<cxxflags>"-hstd=c++11"
 	<threading>multi
 #	<link>static
-	: default-build debug release debug_mic release_mic
+	: default-build release
+#	: default-build debug release debug_mic release_mic
 	;
 
 # Object files that are compiled the same for all targets
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 1ec104a..6e7f318 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -327,7 +327,7 @@ class communicator {
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
 		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
-		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win);
+		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win);
 		MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_win);
 	}
 	

From bf0dfa536c44ca1abf9eb1a023557b529ae26d52 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sat, 7 Apr 2018 18:57:04 +0200
Subject: [PATCH 023/150] made host permanently lock all windows

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 6e7f318..954d504 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -95,7 +95,7 @@ class communicator {
 			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
             if(uses_rma_)
             {
-                MPI_Win_unlock(target_node, communicator::instance().peers[target_node].rma_win);
+                MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_win);
             }
 			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
 		}
@@ -212,6 +212,13 @@ class communicator {
             MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_win));
         }
 
+		// get all locks to targets
+		if (is_host()) {
+			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+				MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);
+			}
+		}
+
         HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
 /* pairwise COMM stuff
        // both
@@ -304,9 +311,10 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		// execute transfer
-		MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
         MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
-        MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
+        MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
+		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
 	}
 
 	// to be used by the host only
@@ -315,7 +323,7 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-        MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+        // MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
         MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
 	}
 
@@ -326,9 +334,10 @@ class communicator {
 	template<typename T>
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
 		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win);
-		MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_win);
+		MPI_Win_flush(remote_source.node(), peers[remote_source.node()].rma_win);
+		// MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_win);
 	}
 	
 	// to be used by the host
@@ -337,7 +346,7 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-		MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
 		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
 	}
 

From 4e2c5de2b6df268bcebb98b82add21a542bda4ac Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sat, 7 Apr 2018 19:16:20 +0200
Subject: [PATCH 024/150] fixed send_data() for target-target-copy

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 954d504..3922e04 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -215,7 +215,7 @@ class communicator {
 		// get all locks to targets
 		if (is_host()) {
 			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-				MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);
+				MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks so host won't need to unlock for target-target-copy
 			}
 		}
 
@@ -311,10 +311,10 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		// execute transfer
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+		MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
         MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
-        MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
-		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
+        // MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
+		MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
 	}
 
 	// to be used by the host only

From 292369ff5096e54ac86b765e0b80238b6db27063 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 12 Apr 2018 15:49:21 +0200
Subject: [PATCH 025/150] changed benchmark for new backend

---
 src/benchmark_ham_offload.cpp | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/benchmark_ham_offload.cpp b/src/benchmark_ham_offload.cpp
index 90a0f64..3e55ec7 100644
--- a/src/benchmark_ham_offload.cpp
+++ b/src/benchmark_ham_offload.cpp
@@ -10,6 +10,7 @@
 #include <string>
 #include <sstream>
 #include <stdlib.h> // posix_memalign
+//#include <ham/net/communicator_mpi.hpp>
 
 #include "ham/util/time.hpp"
 
@@ -124,6 +125,7 @@ int main(int argc, char * argv[])
 		("allocate,a", boost::program_options::value<bool>()->zero_tokens(), "benchmark memory allocation/deallocation on target")
 		("copy-in,i", boost::program_options::value<bool>()->zero_tokens(), "benchmark data copy to target")
 		("copy-out,o", boost::program_options::value<bool>()->zero_tokens(), "benchmark data copy from target")
+		("copy-direct,d", boost::program_options::value<bool>()->zero_tokens(), "benchmark data copy from target to another target")
 		("call,c", boost::program_options::value<bool>()->zero_tokens(), "benchmark function call on target")
 		("call-mul,m", boost::program_options::value<bool>()->zero_tokens(), "benchmark function call (multiplication) on target")
 		("async,y", boost::program_options::value<bool>()->zero_tokens(), "perform benchmark function calls asynchronously")
@@ -157,6 +159,11 @@ int main(int argc, char * argv[])
 		std::cout << "# COMM_MPI                     enabled" << std::endl;
 	#else
 		std::cout << "# COMM_MPI                     disabled" << std::endl;
+    #endif
+	#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+		std::cout << "# COMM_MPI_RMA_DYNAMIC         enabled" << std::endl;
+	#else
+		std::cout << "# COMM_MPI_RMA_DYNAMIC         disabled" << std::endl;
 	#endif
 
 #ifdef HAM_COMM_SCIF
@@ -258,6 +265,29 @@ int main(int argc, char * argv[])
 		copy_out_time.to_file(filename + "copy_out_time");
 	}
 
+	if (vm.count("copy-direct"))
+	{
+		// first allocate memory
+		offload::buffer_ptr<char> remote_source = offload::allocate<char>(1, data_size);
+		offload::buffer_ptr<char> remote_target = offload::allocate<char>(2, data_size);
+		statistics copy_direct_time(runs, warmup_runs);
+
+		for (size_t i = 0; i < (runs + warmup_runs); ++i)
+		{
+			timer clock;
+			offload_copy_direct(remote_source, remote_target, data_size);
+			copy_direct_time.add(clock);
+		}
+		// free memory
+		offload_free(remote_source);
+		offload_free(remote_target);
+
+		cout << "HAM-Offload copy-direct time: " << endl
+			 << header_string_data << endl
+			 << "copy-direct:\t" << copy_direct_time.string() << "\t" << data_size << endl;
+		copy_direct_time.to_file(filename + "copy_direct_time");
+	}
+
 	if (vm.count("call"))
 	{
 		statistics call_time(runs, warmup_runs);

From 3a16160d5b402dda174f7716ece146711341b87a Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 12 Apr 2018 18:03:45 +0200
Subject: [PATCH 026/150] added all ranks permanent window locks, only attach
 buffers to own window

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 3922e04..61f1f27 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -213,11 +213,13 @@ class communicator {
         }
 
 		// get all locks to targets
-		if (is_host()) {
-			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-				MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks so host won't need to unlock for target-target-copy
-			}
-		}
+        // targets lock to other targets for copies
+        for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+            if(i != this_node_) {
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks because all ranks lock on every target concurrently
+            }
+        }
+
 
         HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
 /* pairwise COMM stuff
@@ -311,10 +313,10 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		// execute transfer
-		MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win); // not needed since all ranks have locks on all targets
         MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
         // MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
-		MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
+		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
 	}
 
 	// to be used by the host only
@@ -356,10 +358,11 @@ class communicator {
 		T* ptr;
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
-        // attach to all windows
-        for (node_t i = 1; i < nodes_; ++i) {
+        // attach to own window
+        MPI_Win_attach(peers[this_node_].rma_win, (void*)ptr, n * sizeof(T));
+        /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_win, (void*)ptr, n * sizeof(T));
-        }
+        } */
 		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
 		// NOTE: no ctor is called
@@ -382,10 +385,11 @@ class communicator {
 	{
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
-		// remove from all rma windows
-        for (node_t i = 1; i < nodes_; ++i) {
+        // remove from own rma window
+        MPI_Win_detach(peers[this_node_].rma_win, ptr.get());
+        /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_win, ptr.get());
-        }
+        } */
 		free(static_cast<void*>(ptr.get()));
 	}
 

From fd3627706d7c1f44e2012a041aa1c05aa38ce873 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Thu, 12 Apr 2018 18:32:53 +0200
Subject: [PATCH 027/150] fixed missing flush for copy

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 61f1f27..ea50eff 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -315,7 +315,7 @@ class communicator {
 		// execute transfer
 		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win); // not needed since all ranks have locks on all targets
         MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
-        // MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
+        MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
 		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
 	}
 

From 3b22ec1b0695e19977662d9e08b2dca16a609b47 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 20 Apr 2018 17:33:09 +0200
Subject: [PATCH 028/150] initial commit of truly one-sided rma backend

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 299 ++++++++++++++----
 include/ham/offload/offload_msg.hpp           |   4 +-
 2 files changed, 239 insertions(+), 64 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index ea50eff..2e29c0c 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -17,6 +17,7 @@
 #include "ham/misc/types.hpp"
 #include "ham/util/debug.hpp"
 #include "ham/util/log.hpp"
+#include "communicator.hpp"
 
 namespace ham {
 namespace net {
@@ -64,40 +65,52 @@ class node_descriptor
 
 class communicator {
 public:
-	// externally used interface of request must be shared across all communicator-implementations
+	enum {
+        NO_BUFFER_INDEX = constants::MSG_BUFFERS, // invalid buffer index (max valid + 1)
+        FLAG_FALSE = constants::MSG_BUFFERS + 1 // special value, outside normal index range
+    };
+
+    // externally used interface of request must be shared across all communicator-implementations
 	class request {
 	public:
 		request() : valid_(false) {} // instantiate invalid
 		
-		request(node_t target_node, node_t source_node, size_t send_buffer_index, size_t recv_buffer_index)
-		 : target_node(target_node), source_node(source_node), valid_(true), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0), uses_rma_(false)
+		request(node_t target_node, node_t source_node, size_t remote_buffer_index, size_t local_buffer_index)
+		 : target_node(target_node), source_node(source_node), valid_(true), remote_buffer_index(remote_buffer_index), local_buffer_index(local_buffer_index), req_count(0), uses_rma_(false)
 		{}
 
 		// return true if request was finished
         // will not work as intended for rma ops, no equivalent to test() available for remote completion
 		bool test()
 		{
-			int flag = 0;
-			MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // just test the receive request, since the send belonging to the request triggers the remote send that is received
+			// int flag = 0;
+
+            // MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // just test the receive request, since the send belonging to the request triggers the remote send that is received
 
+            /*
             if(uses_rma_)
             {
                 HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
             }
 
             return flag != 0;
+            */
+            return communicator::instance().test_local_flag(target_node, local_buffer_index);
 		}
 
 		void* get() // blocks
 		{
-			HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+            /*
+            HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
 			MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
 			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
             if(uses_rma_)
             {
-                MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_win);
+                MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_data_win);
             }
 			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
+            */
+            return communicator::instance().recv_msg(target_node, local_buffer_index);
 		}
 
 		template<class T>
@@ -106,8 +119,8 @@ class communicator {
 			assert(communicator::this_node() == target_node); // this assert fails if send_result is called from the wrong side
 			
 			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
-			MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
-			//communicator::instance().send_msg(source_node, source_buffer_index, NO_BUFFER_INDEX, result_msg, size);
+			// MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
+			communicator::instance().send_msg(source_node, local_buffer_index, NO_BUFFER_INDEX, result_msg, size);
 		}
 
 		bool valid() const
@@ -135,8 +148,8 @@ class communicator {
 		// only needed by the sender
 		enum { NUM_REQUESTS = 3 };
 		
-		size_t send_buffer_index; // buffer to use for sending the message
-		size_t recv_buffer_index; // buffer to use for receiving the result
+		size_t remote_buffer_index; // buffer to use for sending the message
+		size_t local_buffer_index; // buffer to use for receiving the result
 		size_t req_count;
 		
 	private:
@@ -194,7 +207,7 @@ class communicator {
 		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
 
-
+        /*
         if (is_host()) {
 
             for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
@@ -205,18 +218,76 @@ class communicator {
                     peers[i].buffer_pool.add(j - 1);
                 }
             }
-        }
+        }*/
+
+        // initialise all windows
+        for (node_t i = 0; i < nodes_; ++i) {
+            // dynamic data window
+            MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_data_win));
+
+            if (i == this_node_) { // create local windows with allocated memory for targets, host creates one inbound set of windows for all targets
+                // allocate memory
+                if (this_node_ == host_node_) {
+                    // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS * num_nodes for host
+                    peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+                    peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+                    // fill resource pools
+                    for (size_t j = 0; j < nodes_; ++j) {
+                        for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
+                            peers[j].local_buffer_pool.add(k - 1);
+                            peers[j].remote_buffer_pool.add(k - 1);
+                        }
+                        // allocate first next_request,
+                        allocate_next_request(j);
+                    }
+                } else {
+                    // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS for targets
+                    peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+                    peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
+                }
 
-        // initialise 1 global window per target
-        for (node_t i = 1; i < nodes_; ++i) {
-            MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_win));
+                // create windows
+                MPI_Win_create(&(peers[this_node_].msg_data), sizeof(msg_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_win));
+                MPI_Win_create(&(peers[this_node_].flag_data), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+
+            } else { //create remote windows without memory
+                void* dump;
+                MPI_Win_create(dump, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_win));
+                MPI_Win_create(dump, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
+
+                //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].msg_win_data, &(peers[i].rma_msg_win));
+                //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].flag_win_data, &(peers[i].rma_flag_win));
+            }
         }
+/*
+        // initialise all windows for target -> host
+        for (node_t i = 1; i < nodes_; ++i) {
+            if (is_host()) {
+                // create local wins with memory for all targets
+                // allocate memory
+
+
+                // create window
+                MPI_Win_create(memptr, SIZE, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].local_msg_win));
+                MPI_Win_create(memptr, SIZE, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].local_flag_win));
 
+            } else {
+                // create remote wins without memory for host
+                if (i == this_node_) {
+                    MPI_Win_create(memptr, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[host_node_].local_msg_win));
+                    MPI_Win_create(memptr, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[host_node_].local_msg_win));
+
+                }
+            }
+        }
+*/
 		// get all locks to targets
         // targets lock to other targets for copies
-        for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-            if(i != this_node_) {
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks because all ranks lock on every target concurrently
+        for (node_t i = 0; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+            if (i != this_node_) {
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_data_win);  // shared locks because all ranks lock on every target concurrently
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].msg_win);  // shared locks because all ranks lock on every target concurrently
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].flag_win);  // shared locks because all ranks lock on every target concurrently
             }
         }
 
@@ -235,7 +306,7 @@ class communicator {
  				MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
 
  				// init win to target
- 				MPI_Win_create_dynamic(MPI_INFO_NULL, peers[i].rma_comm, &(peers[i].rma_win));
+ 				MPI_Win_create_dynamic(MPI_INFO_NULL, peers[i].rma_comm, &(peers[i].rma_data_win));
        // targets
  			    // init comm to host from pairwise subgroup
  			    const int members[2] = {host_node_, this_node_}; // NOTE: this implies new group rank = 0 for host, 1 for target
@@ -245,7 +316,7 @@ class communicator {
  			    MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
 
  			    // init win to host
- 			    MPI_Win_create_dynamic(MPI_INFO_NULL, peers[host_node_].rma_comm, &(peers[host_node_].rma_win));
+ 			    MPI_Win_create_dynamic(MPI_INFO_NULL, peers[host_node_].rma_comm, &(peers[host_node_].rma_data_win));
  */
 
 	}
@@ -256,17 +327,33 @@ class communicator {
 		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
 	}
 
-
+    // this is only used by the host
+    const request& allocate_next_request(node_t remote_node)
+    {
+        // this allocates a host-managed index for the remote nodes msg and flag buffers
+        // so the host knows which buffers are available on the target
+        const size_t remote_buffer_index = peers[remote_node].remote_buffer_pool.allocate();
+        // this allocates an index for the hosts large msg and flag buffers
+        // request is included in offload message, so target knows into which buffers answers must be written
+        // when used, the index will need to be added to an offset determined by a targets rank to address the part of the buffer belonging to this target
+        // NOTE: the actual host buffer is stored at the hosts peers[0], but the buffer_pools are stored at the corresponding peers[target]
+        // buffer_pools manage idices within the targets section of the hosts buffer
+        const size_t local_buffer_index = peers[remote_node].local_buffer_pool.allocate();
+
+        peers[remote_node].next_request = { remote_node, this_node_, remote_buffer_index, local_buffer_index};
+
+        return peers[remote_node].next_request;
+    }
+
+    // only used by host
 	request allocate_request(node_t remote_node)
 	{
-		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
 
-		const size_t send_buffer_index = peers[remote_node].buffer_pool.allocate();
-		const size_t recv_buffer_index = peers[remote_node].buffer_pool.allocate();
-
-		return { remote_node, this_node_, send_buffer_index, recv_buffer_index };
+		return peers[remote_node].next_request;
 	}
 
+    // only used by host
 	void free_request(request& req)
 	{
 		assert(req.valid());
@@ -274,27 +361,89 @@ class communicator {
 	
 		mpi_peer& peer = peers[req.target_node];
 
-		peer.buffer_pool.free(req.send_buffer_index);
-		peer.buffer_pool.free(req.recv_buffer_index);
+        // set flags to false
+        // local flag inside large host flag buffer @ peers[host]
+        // index offset computed using target node
+        size_t offset = sizeof(cache_line_buffer) * constants::MSG_BUFFERS * req.target_node;
+        volatile size_t* local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + req.local_buffer_index]);
+        *local_flag= FLAG_FALSE;
+        // remote flag on target
+        size_t remote_flag = FLAG_FALSE;
+        MPI_Put(&remote_flag, 1, MPI_INT64_T, req.target_node, 0, 1, MPI_INT64_T, peer.flag_win);
+        // flush? don't think so
+
+		peer.remote_buffer_pool.free(req.remote_buffer_index);
+		peer.local_buffer_pool.free(req.local_buffer_index);
+
 		req.valid_ = false;
 	}
 
 public:
+    // make private?!
+    // only called by host
+    // called by func below
+    void send_msg(node_t node, size_t buffer_index, size_t next_buffer_index, void* msg, size_t size) {
+        // write msg to target msg buffer
+        MPI_Put(msg, size, MPI_BYTE, node, buffer_index, size, MPI_BYTE, peers[node].msg_win);
+
+        // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
+
+        // write flag to target flags buffer
+        // not sure on the size here?
+        MPI_Put(&next_buffer_index, 1, MPI_INT64_T, node, buffer_index, 1, MPI_INT64_T, peers[node].flag_win);
+    }
+
+    // only called by host
 	void send_msg(request_reference_type req, void* msg, size_t size)
 	{
-		// copy message from caller into transfer buffer
+		/*
+        // copy message from caller into transfer buffer
 		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
 		memcpy(msg_buffer, msg, size);
 		MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
-	}
-	
+	    */
+
+        const request& next_req = allocate_next_request(req.target_node); // allocate_next_req needed??
+        send_msg(req.target_node, req.remote_buffer_index, next_req.remote_buffer_index, msg, size);
+    }
+
+    // make private?!
+    // called by function below
+    void * recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
+    {
+        buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
+
+        volatile size_t* local_flag;
+
+        if (this_node_ == host_node_) {
+            size_t offset = sizeof(cache_line_buffer) * constants::MSG_BUFFERS * node;
+            local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + buffer_index]);
+        } else {
+            local_flag = reinterpret_cast<size_t*>(&peers[node].flag_data.get()[buffer_index]);
+        }
+
+
+        while (*local_flag == FLAG_FALSE); // poll on flag for completion
+
+        if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
+            peers[node].next_flag = *local_flag;
+
+        if (this_node_ == host_node_) {
+            size_t offset = sizeof(msg_buffer) * constants::MSG_BUFFERS * node;
+            return &peers[host_node_].msg_data.get()[offset + buffer_index];
+        } else {
+            return &peers[node].msg_data.get()[buffer_index];
+        }
+    }
+
 	// to be used by the offload target's main loop: synchronously receive one message at a time
 	// NOTE: the local static receive buffer!
 	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE)
 	{
-		static msg_buffer buffer; // NOTE !
+		/* static msg_buffer buffer; // NOTE !
 		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-        return static_cast<void*>(&buffer);
+        return static_cast<void*>(&buffer); */
+        return recv_msg(host_node_, NO_BUFFER_INDEX, msg, size);
 	}
 
 	// trigger receiving the result of a message on the sending side
@@ -302,10 +451,17 @@ class communicator {
 	{
 		// nothing todo here, since this communicator implementation uses one-sided communication
 		// the data is already where it is expected (in the buffer referenced in req)
-		MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+
+        // MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
 		return;
 	}
 
+    bool test_local_flag(node_t node, size_t buffer_index)
+    {
+        volatile size_t * local_flag = reinterpret_cast<size_t*>(&peers[node].flag_data.get()[buffer_index]);
+        return *local_flag != FLAG_FALSE;
+    }
+
 	// in MPI RMA backend only used by copy
 	// host uses async version
 	// targets don't send data to host as host uses rma get
@@ -313,10 +469,10 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		// execute transfer
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win); // not needed since all ranks have locks on all targets
-        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
-        MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
-		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_win);
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_data_win); // not needed since all ranks have locks on all targets
+        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win);
+        MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
+		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
 	}
 
 	// to be used by the host only
@@ -325,8 +481,8 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-        // MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
-        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
+        // MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_data_win);
+        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win, &req.next_mpi_request());
 	}
 
 	// not used in MPI RMA backend
@@ -336,10 +492,10 @@ class communicator {
 	template<typename T>
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
-		MPI_Get(remote_source, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win);
-		MPI_Win_flush(remote_source.node(), peers[remote_source.node()].rma_win);
-		// MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_win);
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_data_win);
+		MPI_Get(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win);
+		MPI_Win_flush(remote_source.node(), peers[remote_source.node()].rma_data_win);
+		// MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_data_win);
 	}
 	
 	// to be used by the host
@@ -348,8 +504,8 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_win);
-		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
+		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_data_win);
+		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win, &req.next_mpi_request());
 	}
 
 	template<typename T>
@@ -359,9 +515,9 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        MPI_Win_attach(peers[this_node_].rma_win, (void*)ptr, n * sizeof(T));
+        MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
-            MPI_Win_attach(peers[i].rma_win, (void*)ptr, n * sizeof(T));
+            MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
         } */
 		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
@@ -369,26 +525,29 @@ class communicator {
 		return buffer_ptr<T>(ptr, this_node_, mpi_address);
 	}
 
-	// for host to allocate peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
+	// for host to allocate peer message buffers, needed because original function now manages dynamic window for data buffers
 	template<typename T>
 	buffer_ptr<T> allocate_peer_buffer(const size_t n, node_t source_node)
 	{
-		T* ptr;
+        // TODO DANIEL: this is where mem is allocated that should be mapped to static mpi windows
+        T* ptr;
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
 		// NOTE: no ctor is called
 		return buffer_ptr<T>(ptr, this_node_);
+
 	}
 
+    // used for data buffers only
 	template<typename T>
 	void free_buffer(buffer_ptr<T> ptr)
 	{
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        MPI_Win_detach(peers[this_node_].rma_win, ptr.get());
+        MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
-            MPI_Win_detach(peers[i].rma_win, ptr.get());
+            MPI_Win_detach(peers[i].rma_data_win, ptr.get());
         } */
 		free(static_cast<void*>(ptr.get()));
 	}
@@ -397,6 +556,8 @@ class communicator {
 	template<typename T>
 	void free_peer_buffer(buffer_ptr<T> ptr)
 	{
+        // TODO DANIEL: this is where mem is freed that should be mapped to static mpi windows
+        // i dont think this is ever called on the actual memory mapped to static mpi windows, freeing it would equal "disconnecting" corresponding target
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
 		free(static_cast<void*>(ptr.get()));
@@ -417,7 +578,7 @@ class communicator {
 	// called to check if an rma path between two targets exists, sufficient to call on one of the two targets
 	bool has_rma_path(node_t target_node) {
 		// check if copy path exists
-		return !peers[remote_dest.node()].rma_win;
+		return !peers[remote_dest.node()].rma_data_win;
 	}
 */
 /*
@@ -438,7 +599,7 @@ class communicator {
 			MPI_Group_incl(global_group, 2, members, &pairwise_group);
 			MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[target_node].rma_comm));
 			MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
-			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[target_node].rma_comm, &(peers[target_node].rma_win));
+			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[target_node].rma_comm, &(peers[target_node].rma_data_win));
 		}
 	}
 */
@@ -450,19 +611,33 @@ class communicator {
 	node_t host_node_;
 	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
 
-	struct mpi_peer {
-		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
+    struct mpi_peer {
+		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender // buffers used for MPI_RPut and RGet
 
 		// needed by sender to manage which buffers are in use and which are free
 		// just manages indices, that can be used by
-		detail::resource_pool<size_t> buffer_pool;
+		detail::resource_pool<size_t> local_buffer_pool;
+        detail::resource_pool<size_t> remote_buffer_pool;
+
+        request next_request;
+        size_t next_flag = 0;
 
-		// mpi rma dynamic window
-		MPI_Win rma_win;
+        // NOTE: behind these buffers are MSG_BUFFERS many buffers of size MSG_SIZE/CACHE_LINE_SIZE, indices are managed by buffer_pool
+
+        // static window for inbound rma messages
+        buffer_ptr<msg_buffer> msg_data;
+        MPI_Win msg_win;
+        // static window for inbound message flags
+        buffer_ptr<cache_line_buffer> flag_data;
+        MPI_Win flag_win;
+
+		// mpi rma dynamic window for data
+		MPI_Win rma_data_win;
 	};
-	
+
+
 	mpi_peer* peers;
-};
+    };
 
 template<typename T>
 buffer_ptr<T>::buffer_ptr() : buffer_ptr(nullptr, communicator::this_node()) { }
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index cb8a5a8..97c5e95 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -134,7 +134,7 @@ class offload_read_msg
 	size_t n;
 };
 
-//#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC
     template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
     class offload_rma_copy_msg
             : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -163,7 +163,7 @@ class offload_read_msg
         T* local_source;
         size_t n;
     };
-//#endif
+#endif
 
 /*
 // allows user to setup an rma link between two targets without a copy transfer

From cb3246e6e652c629e3a475fad189e1d25c2edaab Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Mon, 23 Apr 2018 14:06:41 +0200
Subject: [PATCH 029/150] fixed recv_msg checking flag buffer @ wrong peer

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 2e29c0c..8449a6d 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -380,20 +380,18 @@ class communicator {
 
 public:
     // make private?!
-    // only called by host
     // called by func below
     void send_msg(node_t node, size_t buffer_index, size_t next_buffer_index, void* msg, size_t size) {
         // write msg to target msg buffer
         MPI_Put(msg, size, MPI_BYTE, node, buffer_index, size, MPI_BYTE, peers[node].msg_win);
 
         // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
-
+        MPI_Win_flush(node ,peers[node].msg_win);
         // write flag to target flags buffer
         // not sure on the size here?
         MPI_Put(&next_buffer_index, 1, MPI_INT64_T, node, buffer_index, 1, MPI_INT64_T, peers[node].flag_win);
     }
 
-    // only called by host
 	void send_msg(request_reference_type req, void* msg, size_t size)
 	{
 		/*
@@ -419,7 +417,7 @@ class communicator {
             size_t offset = sizeof(cache_line_buffer) * constants::MSG_BUFFERS * node;
             local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + buffer_index]);
         } else {
-            local_flag = reinterpret_cast<size_t*>(&peers[node].flag_data.get()[buffer_index]);
+            local_flag = reinterpret_cast<size_t*>(&peers[this_node_].flag_data.get()[buffer_index]);
         }
 
 

From 8b0c0e31f2108eb0f38970c9193948e8dc49abe6 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Mon, 23 Apr 2018 14:14:09 +0200
Subject: [PATCH 030/150] fixed recv_msg returning wrong peer buffer

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 8449a6d..fa87caa 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -430,7 +430,7 @@ class communicator {
             size_t offset = sizeof(msg_buffer) * constants::MSG_BUFFERS * node;
             return &peers[host_node_].msg_data.get()[offset + buffer_index];
         } else {
-            return &peers[node].msg_data.get()[buffer_index];
+            return &peers[this_node_].msg_data.get()[buffer_index];
         }
     }
 

From 72f9f232b3509955851e54912b1123e930af139e Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Mon, 23 Apr 2018 14:46:33 +0200
Subject: [PATCH 031/150] fixed wrong offset computation for host buffers

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index fa87caa..89acb24 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -234,6 +234,7 @@ class communicator {
                     // fill resource pools
                     for (size_t j = 0; j < nodes_; ++j) {
                         for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
+                            // target buffers
                             peers[j].local_buffer_pool.add(k - 1);
                             peers[j].remote_buffer_pool.add(k - 1);
                         }
@@ -364,7 +365,7 @@ class communicator {
         // set flags to false
         // local flag inside large host flag buffer @ peers[host]
         // index offset computed using target node
-        size_t offset = sizeof(cache_line_buffer) * constants::MSG_BUFFERS * req.target_node;
+        size_t offset = constants::MSG_BUFFERS * req.target_node;
         volatile size_t* local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + req.local_buffer_index]);
         *local_flag= FLAG_FALSE;
         // remote flag on target
@@ -414,7 +415,7 @@ class communicator {
         volatile size_t* local_flag;
 
         if (this_node_ == host_node_) {
-            size_t offset = sizeof(cache_line_buffer) * constants::MSG_BUFFERS * node;
+            size_t offset = constants::MSG_BUFFERS * node;
             local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + buffer_index]);
         } else {
             local_flag = reinterpret_cast<size_t*>(&peers[this_node_].flag_data.get()[buffer_index]);
@@ -427,7 +428,7 @@ class communicator {
             peers[node].next_flag = *local_flag;
 
         if (this_node_ == host_node_) {
-            size_t offset = sizeof(msg_buffer) * constants::MSG_BUFFERS * node;
+            size_t offset = constants::MSG_BUFFERS * node;
             return &peers[host_node_].msg_data.get()[offset + buffer_index];
         } else {
             return &peers[this_node_].msg_data.get()[buffer_index];

From 85d19896b86adb20ce66e5f23951c76856d2a26d Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Mon, 23 Apr 2018 15:56:21 +0200
Subject: [PATCH 032/150] added flag buffer init with FLAG_FALSE

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 89acb24..2af3512 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -231,6 +231,7 @@ class communicator {
                     // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS * num_nodes for host
                     peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
                     peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+                    reset_flags(peers[this_node_].flag_data);
                     // fill resource pools
                     for (size_t j = 0; j < nodes_; ++j) {
                         for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
@@ -245,6 +246,7 @@ class communicator {
                     // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS for targets
                     peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
                     peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
+                    reset_flags(peers[this_node_].flag_data);
                 }
 
                 // create windows
@@ -461,6 +463,18 @@ class communicator {
         return *local_flag != FLAG_FALSE;
     }
 
+    void reset_flags(buffer_ptr<cache_line_buffer> flags)
+	{
+		cache_line_buffer fill_value;
+		cache_line_buffer* fill_value_ptr = &fill_value;
+		// null fill_value
+		std::fill(reinterpret_cast<unsigned char*>(fill_value_ptr), reinterpret_cast<unsigned char*>(fill_value_ptr) + sizeof(cache_line_buffer), 0);
+		// set to flag false
+		*reinterpret_cast<size_t*>(fill_value_ptr) = FLAG_FALSE;
+		// set all flags to fill_value
+		std::fill(flags.get(), flags.get() + constants::MSG_BUFFERS, fill_value);
+	}
+
 	// in MPI RMA backend only used by copy
 	// host uses async version
 	// targets don't send data to host as host uses rma get

From 304b71d678a70851567c7bfcc4e5f3703eaa4ef7 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Mon, 23 Apr 2018 17:43:37 +0200
Subject: [PATCH 033/150] fixed in-buffer addressing

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 21 ++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 2af3512..b0b7865 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -386,13 +386,20 @@ class communicator {
     // called by func below
     void send_msg(node_t node, size_t buffer_index, size_t next_buffer_index, void* msg, size_t size) {
         // write msg to target msg buffer
-        MPI_Put(msg, size, MPI_BYTE, node, buffer_index, size, MPI_BYTE, peers[node].msg_win);
-
-        // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
-        MPI_Win_flush(node ,peers[node].msg_win);
-        // write flag to target flags buffer
-        // not sure on the size here?
-        MPI_Put(&next_buffer_index, 1, MPI_INT64_T, node, buffer_index, 1, MPI_INT64_T, peers[node].flag_win);
+        if (node != host_node_) { // to targets
+            MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
+
+            // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
+            MPI_Win_flush(node, peers[node].msg_win);
+            // write flag to target flags buffer
+            // not sure on the size here?
+            MPI_Put(&next_buffer_index, sizeof(cache_line_buffer), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(cache_line_buffer), MPI_BYTE, peers[node].flag_win);
+        } else { // to host, used by send_result
+            size_t offset = constants::MSG_BUFFERS * this_node_;
+            MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
+            MPI_Win_flush(node, peers[node].msg_win);
+            MPI_Put(&next_buffer_index, sizeof(cache_line_buffer), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(cache_line_buffer), MPI_BYTE, peers[node].flag_win);
+        }
     }
 
 	void send_msg(request_reference_type req, void* msg, size_t size)

From 24fba45c7f736d40f3ff1e3ed835789b98b95e59 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Mon, 23 Apr 2018 18:18:11 +0200
Subject: [PATCH 034/150] fixed flag init for large host buffer

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index b0b7865..daee37d 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -231,7 +231,7 @@ class communicator {
                     // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS * num_nodes for host
                     peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
                     peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
-                    reset_flags(peers[this_node_].flag_data);
+                    reset_flags(peers[this_node_].flag_data, constants::MSG_BUFFERS * nodes_);
                     // fill resource pools
                     for (size_t j = 0; j < nodes_; ++j) {
                         for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
@@ -246,7 +246,7 @@ class communicator {
                     // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS for targets
                     peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
                     peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
-                    reset_flags(peers[this_node_].flag_data);
+                    reset_flags(peers[this_node_].flag_data, constants::MSG_BUFFERS);
                 }
 
                 // create windows
@@ -470,7 +470,7 @@ class communicator {
         return *local_flag != FLAG_FALSE;
     }
 
-    void reset_flags(buffer_ptr<cache_line_buffer> flags)
+    void reset_flags(buffer_ptr<cache_line_buffer> flags, size_t size)
 	{
 		cache_line_buffer fill_value;
 		cache_line_buffer* fill_value_ptr = &fill_value;
@@ -479,7 +479,7 @@ class communicator {
 		// set to flag false
 		*reinterpret_cast<size_t*>(fill_value_ptr) = FLAG_FALSE;
 		// set all flags to fill_value
-		std::fill(flags.get(), flags.get() + constants::MSG_BUFFERS, fill_value);
+		std::fill(flags.get(), flags.get() + size, fill_value);
 	}
 
 	// in MPI RMA backend only used by copy

From fdcd288ea281def509901dbd2c58439a2db1516a Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Tue, 24 Apr 2018 14:08:46 +0200
Subject: [PATCH 035/150] fixed window creation buffer pointer

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index daee37d..943822f 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -250,8 +250,8 @@ class communicator {
                 }
 
                 // create windows
-                MPI_Win_create(&(peers[this_node_].msg_data), sizeof(msg_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_win));
-                MPI_Win_create(&(peers[this_node_].flag_data), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+                MPI_Win_create((peers[this_node_].msg_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_win));
+                MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
 
             } else { //create remote windows without memory
                 void* dump;

From 6de2bfd49214ca34bc309d0296759759baa5162f Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Tue, 24 Apr 2018 14:21:43 +0200
Subject: [PATCH 036/150] fixed size for put of flag

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 943822f..86a8a38 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -393,12 +393,12 @@ class communicator {
             MPI_Win_flush(node, peers[node].msg_win);
             // write flag to target flags buffer
             // not sure on the size here?
-            MPI_Put(&next_buffer_index, sizeof(cache_line_buffer), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(cache_line_buffer), MPI_BYTE, peers[node].flag_win);
+            MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
         } else { // to host, used by send_result
             size_t offset = constants::MSG_BUFFERS * this_node_;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
             MPI_Win_flush(node, peers[node].msg_win);
-            MPI_Put(&next_buffer_index, sizeof(cache_line_buffer), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(cache_line_buffer), MPI_BYTE, peers[node].flag_win);
+            MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
         }
     }
 

From e0cc1afb3d1bb93709767d301af7f5e106d28832 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Tue, 24 Apr 2018 16:29:52 +0200
Subject: [PATCH 037/150] fixed displacement on host window

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 86a8a38..bde7d13 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -250,13 +250,13 @@ class communicator {
                 }
 
                 // create windows
-                MPI_Win_create((peers[this_node_].msg_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_win));
-                MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+                MPI_Win_create((peers[this_node_].msg_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_win));
+                MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
 
             } else { //create remote windows without memory
                 void* dump;
-                MPI_Win_create(dump, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_win));
-                MPI_Win_create(dump, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
+                MPI_Win_create(dump, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_win));
+                MPI_Win_create(dump, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
 
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].msg_win_data, &(peers[i].rma_msg_win));
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].flag_win_data, &(peers[i].rma_flag_win));

From 339486d0a02c110741c64f5b6af519774b9248b3 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Tue, 24 Apr 2018 17:53:17 +0200
Subject: [PATCH 038/150] fixed waiting for reply msg for data transfers

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index bde7d13..20e8e99 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -100,17 +100,19 @@ class communicator {
 
 		void* get() // blocks
 		{
-            /*
+
             HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
 			MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
 			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
             if(uses_rma_)
             {
                 MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_data_win);
+                // this is just a dummy return, there is no reply from the target for rma data transfers
+                // TODO, Daniel - design decision on what to return here
+                return static_cast<void*>(&communicator::instance().peers[communicator::this_node()].msg_data[local_buffer_index]);
+            } else {
+                return communicator::instance().recv_msg(target_node, local_buffer_index);
             }
-			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
-            */
-            return communicator::instance().recv_msg(target_node, local_buffer_index);
 		}
 
 		template<class T>

From 3f6c30694d49fd8e8e22f7750de82dcdc695377f Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:45:23 +0200
Subject: [PATCH 039/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 20e8e99..f04a3b6 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -419,7 +419,7 @@ class communicator {
 
     // make private?!
     // called by function below
-    void * recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
+    void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
     {
         buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
 
@@ -537,6 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
+        HAM_DEBUG( cout << "allocated buffer @: " << ptr << " on node: " << node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -567,6 +568,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
+        HAM_DEBUG( cout << "freeing buffer @: " << ptr << " on node: " << node << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From 81675fb08a0e6cee6182d893a140be71e42c5ebc Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:46:35 +0200
Subject: [PATCH 040/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index f04a3b6..7906b54 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -568,7 +568,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( cout << "freeing buffer @: " << ptr << " on node: " << node << std::endl; )
+        HAM_DEBUG( cout << "freeing buffer @: " << ptr << " on node: " << this_node_ << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From 7e3ead35a2d548a5f11ea1a4bdca2d01c67209b5 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:47:37 +0200
Subject: [PATCH 041/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 7906b54..cbe23b6 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,7 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( cout << "allocated buffer @: " << ptr << " on node: " << node << std::endl; )
+        HAM_DEBUG( std::cout << "allocated buffer @: " << ptr << " on node: " << node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -568,7 +568,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( cout << "freeing buffer @: " << ptr << " on node: " << this_node_ << std::endl; )
+        HAM_DEBUG( std::cout << "freeing buffer @: " << ptr << " on node: " << this_node_ << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From 0d0923a3b33c0d787ccb032adff9bf7ce25ea9f3 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:49:59 +0200
Subject: [PATCH 042/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index cbe23b6..aa64160 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,7 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( std::cout << "allocated buffer @: " << ptr << " on node: " << node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << " on node: " << node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -568,7 +568,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( std::cout << "freeing buffer @: " << ptr << " on node: " << this_node_ << std::endl; )
+        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << ptr << " on node: " << this_node_ << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From d7fa6abfa7d456f33f221e41e7731794314f695c Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:52:04 +0200
Subject: [PATCH 043/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index aa64160..4580664 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,7 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << " on node: " << node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << " on node: " << source_node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));

From fc5e2e58ae904f6a6054a87256ee0c91bd36758b Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:54:15 +0200
Subject: [PATCH 044/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 4580664..556f75e 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,7 +537,8 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << " on node: " << source_node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << std::endl; )
+        HAM_DEBUG( HAM_LOG << "on node: " << source_node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -568,7 +569,8 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << ptr << " on node: " << this_node_ << std::endl; )
+        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << ptr << std::endl; )
+        HAM_DEBUG( HAM_LOG << "on node: " << source_node << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From cf40181e8ba7d9dc192e7ea94cbddd0102291d50 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:55:58 +0200
Subject: [PATCH 045/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 556f75e..fd6003c 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -569,8 +569,8 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << ptr << std::endl; )
-        HAM_DEBUG( HAM_LOG << "on node: " << source_node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << (long)ptr << std::endl; )
+        HAM_DEBUG( HAM_LOG << "on node: " << this_node_n << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From e31cb9bcc0a0945e7f4eef7f31f406877e63bda9 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:57:06 +0200
Subject: [PATCH 046/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index fd6003c..845966e 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -570,7 +570,7 @@ class communicator {
 		// NOTE: no dtor is called
         // remove from own rma window
         HAM_DEBUG( HAM_LOG << "freeing buffer @: " << (long)ptr << std::endl; )
-        HAM_DEBUG( HAM_LOG << "on node: " << this_node_n << std::endl; )
+        HAM_DEBUG( HAM_LOG << "on node: " << this_node_ << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From fbaaf63e9376fc05054aa923d0b3685a12799683 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 14:59:01 +0200
Subject: [PATCH 047/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 845966e..44c5066 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,7 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << std::endl; )
+        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr.get() << std::endl; )
         HAM_DEBUG( HAM_LOG << "on node: " << source_node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
@@ -569,7 +569,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << (long)ptr << std::endl; )
+        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << ptr.get() << std::endl; )
         HAM_DEBUG( HAM_LOG << "on node: " << this_node_ << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other

From 383eb93c64667e4e350f8f5b92e850d34e840e81 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 15:00:38 +0200
Subject: [PATCH 048/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 44c5066..3bd2d16 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,7 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr.get() << std::endl; )
+        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << std::endl; )
         HAM_DEBUG( HAM_LOG << "on node: " << source_node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other

From d2b08f54224039aa4f35c6b1cf492974fc2ac941 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 15:10:13 +0200
Subject: [PATCH 049/150] added debug output to track double-free issue

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 3bd2d16..1798bc5 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -537,8 +537,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "allocated buffer @: " << ptr << std::endl; )
-        HAM_DEBUG( HAM_LOG << "on node: " << source_node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "allocating buffer @: " << (long)ptr << "belonging to node: " << source_node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -569,8 +568,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << ptr.get() << std::endl; )
-        HAM_DEBUG( HAM_LOG << "on node: " << this_node_ << std::endl; )
+        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From 26926631f057e5f8a233c0c02d43bdde9139b13f Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 16:28:36 +0200
Subject: [PATCH 050/150] introduced alternate request allocation for data
 transfers

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 10 ++++++++--
 include/ham/offload/offload.hpp                  |  4 ++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 1798bc5..fce6410 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -358,6 +358,12 @@ class communicator {
 		return peers[remote_node].next_request;
 	}
 
+    // used for async rma data transfers, so they wont take up buffer indices they dont need
+    request allocate_data_request(node_t remote_node) {
+        HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+        return { remote_node, this_node_, NO_BUFFER_INDEX, NO_BUFFER_INDEX };
+    }
+
     // only used by host
 	void free_request(request& req)
 	{
@@ -537,7 +543,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "allocating buffer @: " << (long)ptr << "belonging to node: " << source_node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << " belonging to node: " << source_node << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -568,7 +574,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 1c2e78c..a315c50 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -235,7 +235,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	
 	return result;
 #elif HAM_COMM_MPI_RMA_DYNAMIC
-    future<void> result(comm.allocate_request(remote_dest.node()));
+    future<void> result(comm.allocate_data_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
 	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
     return result;
@@ -273,7 +273,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 	return result;
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
-	future<void> result(comm.allocate_request(remote_source.node()));
+	future<void> result(comm.allocate_data_request(remote_source.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA get..." << std::endl; )
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
     return result;

From ebc3feba9b8d8b5d0460c8fbf392edabdf1b9c17 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 17:10:53 +0200
Subject: [PATCH 051/150] fixed data requests freeing indices @ invalidation

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index fce6410..87ef0a4 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -383,8 +383,14 @@ class communicator {
         MPI_Put(&remote_flag, 1, MPI_INT64_T, req.target_node, 0, 1, MPI_INT64_T, peer.flag_win);
         // flush? don't think so
 
-		peer.remote_buffer_pool.free(req.remote_buffer_index);
-		peer.local_buffer_pool.free(req.local_buffer_index);
+        // only free buffer indices if they are valid
+        // necessary to avoid data transfer requests that do not allocate indices messing up the index pools
+        if(req.remote_buffer_index < NO_BUFFER_INDEX ) {
+            peer.remote_buffer_pool.free(req.remote_buffer_index);
+        }
+        if(req.local_buffer_index < NO_BUFFER_INDEX) {
+            peer.local_buffer_pool.free(req.local_buffer_index);
+        }
 
 		req.valid_ = false;
 	}
@@ -574,7 +580,7 @@ class communicator {
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
         // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::free_buffer(), freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
         MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());

From c0894ff6f469015768325596b099a968b84e2a04 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 17:21:55 +0200
Subject: [PATCH 052/150] fixed data requests freeing indices @ invalidation

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 87ef0a4..9d46ace 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -380,7 +380,7 @@ class communicator {
         *local_flag= FLAG_FALSE;
         // remote flag on target
         size_t remote_flag = FLAG_FALSE;
-        MPI_Put(&remote_flag, 1, MPI_INT64_T, req.target_node, 0, 1, MPI_INT64_T, peer.flag_win);
+        MPI_Put(&remote_flag, sizeof(remote_flag), MPI_BYTE, req.target_node, 0, sizeof(remote_flag), MPI_BYTE, peer.flag_win);
         // flush? don't think so
 
         // only free buffer indices if they are valid

From 49179641e1abfde85da4256891646bdba39b65c3 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 17:30:13 +0200
Subject: [PATCH 053/150] fixed data requests freeing indices @ invalidation

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 9d46ace..e5f7f3e 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -369,8 +369,13 @@ class communicator {
 	{
 		assert(req.valid());
 		assert(req.source_node == this_node_);
-	
-		mpi_peer& peer = peers[req.target_node];
+
+        // dont do any of the following for data transfer requests
+        if(req.remote_buffer_index == NO_BUFFER_INDEX ) {
+            return;
+        }
+
+        mpi_peer& peer = peers[req.target_node];
 
         // set flags to false
         // local flag inside large host flag buffer @ peers[host]
@@ -383,17 +388,13 @@ class communicator {
         MPI_Put(&remote_flag, sizeof(remote_flag), MPI_BYTE, req.target_node, 0, sizeof(remote_flag), MPI_BYTE, peer.flag_win);
         // flush? don't think so
 
-        // only free buffer indices if they are valid
-        // necessary to avoid data transfer requests that do not allocate indices messing up the index pools
-        if(req.remote_buffer_index < NO_BUFFER_INDEX ) {
-            peer.remote_buffer_pool.free(req.remote_buffer_index);
-        }
-        if(req.local_buffer_index < NO_BUFFER_INDEX) {
-            peer.local_buffer_pool.free(req.local_buffer_index);
-        }
 
-		req.valid_ = false;
-	}
+        peer.remote_buffer_pool.free(req.remote_buffer_index);
+
+        peer.local_buffer_pool.free(req.local_buffer_index);
+
+        req.valid_ = false;
+    }
 
 public:
     // make private?!

From 112e5b883b05f7a0e410389b3a651b88c3ba58b5 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 17:37:53 +0200
Subject: [PATCH 054/150] fixed data requests freeing indices @ invalidation

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index e5f7f3e..b44ff62 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -377,6 +377,7 @@ class communicator {
 
         mpi_peer& peer = peers[req.target_node];
 
+
         // set flags to false
         // local flag inside large host flag buffer @ peers[host]
         // index offset computed using target node
@@ -384,10 +385,11 @@ class communicator {
         volatile size_t* local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + req.local_buffer_index]);
         *local_flag= FLAG_FALSE;
         // remote flag on target
+        /* This is done by the target after having reveived the new index to poll on
         size_t remote_flag = FLAG_FALSE;
         MPI_Put(&remote_flag, sizeof(remote_flag), MPI_BYTE, req.target_node, 0, sizeof(remote_flag), MPI_BYTE, peer.flag_win);
         // flush? don't think so
-
+        */
 
         peer.remote_buffer_pool.free(req.remote_buffer_index);
 
@@ -451,6 +453,8 @@ class communicator {
         if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
             peers[node].next_flag = *local_flag;
 
+        *local_flag = FLAG_FALSE;
+
         if (this_node_ == host_node_) {
             size_t offset = constants::MSG_BUFFERS * node;
             return &peers[host_node_].msg_data.get()[offset + buffer_index];

From 314423e0060dfbea7114b63c3f0c25b882544287 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 18:12:28 +0200
Subject: [PATCH 055/150] trying without flush

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index b44ff62..5b80fdd 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -407,7 +407,7 @@ class communicator {
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
-            MPI_Win_flush(node, peers[node].msg_win);
+            //MPI_Win_flush(node, peers[node].msg_win);
             // write flag to target flags buffer
             // not sure on the size here?
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
@@ -554,7 +554,7 @@ class communicator {
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
         // attach to own window
-        HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << " belonging to node: " << source_node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));

From f0ebfd5e128c05cc43eaabe30dd3631bdb40af8c Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Fri, 27 Apr 2018 18:20:06 +0200
Subject: [PATCH 056/150] trying without flush

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 5b80fdd..1eb5d7a 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -414,7 +414,7 @@ class communicator {
         } else { // to host, used by send_result
             size_t offset = constants::MSG_BUFFERS * this_node_;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
-            MPI_Win_flush(node, peers[node].msg_win);
+            //MPI_Win_flush(node, peers[node].msg_win);
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
         }
     }

From b2ebd109b693e82557f91837ec9d84f6bb46cada Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 12:28:11 +0200
Subject: [PATCH 057/150] added logging to send/recv msg

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 1eb5d7a..aec8598 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -403,19 +403,33 @@ class communicator {
     // called by func below
     void send_msg(node_t node, size_t buffer_index, size_t next_buffer_index, void* msg, size_t size) {
         // write msg to target msg buffer
+        HAM_DEBUG( HAM_LOG << "communicator::send_msg(): node =  " << node << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::send_msg(): remote buffer index = " << buffer_index << std::endl; )
+
         if (node != host_node_) { // to targets
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
-            //MPI_Win_flush(node, peers[node].msg_win);
+            MPI_Win_flush(node, peers[node].msg_win);
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+
             // write flag to target flags buffer
             // not sure on the size here?
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
+
         } else { // to host, used by send_result
             size_t offset = constants::MSG_BUFFERS * this_node_;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
-            //MPI_Win_flush(node, peers[node].msg_win);
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
+
+            MPI_Win_flush(node, peers[node].msg_win);
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
+
         }
     }
 
@@ -437,6 +451,8 @@ class communicator {
     void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
     {
         buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
 
         volatile size_t* local_flag;
 
@@ -447,8 +463,9 @@ class communicator {
             local_flag = reinterpret_cast<size_t*>(&peers[this_node_].flag_data.get()[buffer_index]);
         }
 
-
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
         while (*local_flag == FLAG_FALSE); // poll on flag for completion
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
 
         if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
             peers[node].next_flag = *local_flag;

From d2c555291fc381b3e5b784971a49008eaf92a83c Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 13:33:37 +0200
Subject: [PATCH 058/150] added time logging to recv msg

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index aec8598..e412cc1 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -17,6 +17,7 @@
 #include "ham/misc/types.hpp"
 #include "ham/util/debug.hpp"
 #include "ham/util/log.hpp"
+#include "ham/util/time.hpp"
 #include "communicator.hpp"
 
 namespace ham {
@@ -450,6 +451,9 @@ class communicator {
     // called by function below
     void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
     {
+        statistics pre_poll(1,0);
+        statistics poll(1,0);
+        timer t1;
         buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
 		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
@@ -464,8 +468,14 @@ class communicator {
         }
 
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
+        pre_poll.add(t1);
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << pre_poll.min() << std::endl; )
+        timer t2;
         while (*local_flag == FLAG_FALSE); // poll on flag for completion
+        poll.add(t2);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << poll.min() << std::endl; )
+
 
         if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
             peers[node].next_flag = *local_flag;

From fdc1b9e0f74ea46f05901c4de1c00bbabb72ecfe Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 13:39:30 +0200
Subject: [PATCH 059/150] added time logging to recv msg

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index e412cc1..0ad10bc 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -451,9 +451,9 @@ class communicator {
     // called by function below
     void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
     {
-        statistics pre_poll(1,0);
-        statistics poll(1,0);
-        timer t1;
+        ham::util::time::statistics pre_poll(1,0);
+        ham::util::time::statistics poll(1,0);
+        ham::util::time::timer t1;
         buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
 		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
@@ -470,7 +470,7 @@ class communicator {
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
         pre_poll.add(t1);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << pre_poll.min() << std::endl; )
-        timer t2;
+        ham::util::time::timer t2;
         while (*local_flag == FLAG_FALSE); // poll on flag for completion
         poll.add(t2);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )

From a4039134d52446c1fabcfd1854e631e9aa7ed873 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 13:41:45 +0200
Subject: [PATCH 060/150] added time logging to recv msg

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 0ad10bc..2385ac0 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -469,12 +469,12 @@ class communicator {
 
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
         pre_poll.add(t1);
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << pre_poll.min() << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << pre_poll.min().count() << std::endl; )
         ham::util::time::timer t2;
         while (*local_flag == FLAG_FALSE); // poll on flag for completion
         poll.add(t2);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << poll.min() << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << poll.min().count() << std::endl; )
 
 
         if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on

From 6e84d1768d479dcdfbcbd8d961b06c6f75d8648c Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 13:48:55 +0200
Subject: [PATCH 061/150] added time logging to send msg

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 2385ac0..eebd2f2 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -408,17 +408,31 @@ class communicator {
         HAM_DEBUG( HAM_LOG << "communicator::send_msg(): remote buffer index = " << buffer_index << std::endl; )
 
         if (node != host_node_) { // to targets
+            ham::util::time::statistics msg_put(1,0);
+            ham::util::time::statistics flush(1,0);
+            ham::util::time::statistics flag_put(1,0);
+
+            ham::util::time::timer t1;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
+            msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took" << msg_put.min().count() << std::endl; )
+
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
+            ham::util::time::timer t2;
             MPI_Win_flush(node, peers[node].msg_win);
+            flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took" << flush.min().count() << std::endl; )
 
             // write flag to target flags buffer
             // not sure on the size here?
+            ham::util::time::timer t3;
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
+            flag_put.add(t3);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took" << flag_put.min().count() <<std::endl; )
 
         } else { // to host, used by send_result
             size_t offset = constants::MSG_BUFFERS * this_node_;

From 90067a8e499db5e0e4eb6c085df02c00ca1c3808 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 13:54:59 +0200
Subject: [PATCH 062/150] commented out Win_flush when putting msg

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index eebd2f2..96b7483 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -416,15 +416,15 @@ class communicator {
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
             msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took" << msg_put.min().count() << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << msg_put.min().count() << std::endl; )
 
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
             ham::util::time::timer t2;
-            MPI_Win_flush(node, peers[node].msg_win);
+            //MPI_Win_flush(node, peers[node].msg_win);
             flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took" << flush.min().count() << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << flush.min().count() << std::endl; )
 
             // write flag to target flags buffer
             // not sure on the size here?
@@ -432,14 +432,14 @@ class communicator {
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
             flag_put.add(t3);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took" << flag_put.min().count() <<std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << flag_put.min().count() <<std::endl; )
 
         } else { // to host, used by send_result
             size_t offset = constants::MSG_BUFFERS * this_node_;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
 
-            MPI_Win_flush(node, peers[node].msg_win);
+            //MPI_Win_flush(node, peers[node].msg_win);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
 
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);

From c9125c73924a9045affdd22e807516a47cfbe29e Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 13:58:49 +0200
Subject: [PATCH 063/150] added time logging to send msg (target)

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 96b7483..3eeb2e6 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -435,15 +435,28 @@ class communicator {
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << flag_put.min().count() <<std::endl; )
 
         } else { // to host, used by send_result
+            ham::util::time::statistics msg_put(1,0);
+            ham::util::time::statistics flush(1,0);
+            ham::util::time::statistics flag_put(1,0);
+
             size_t offset = constants::MSG_BUFFERS * this_node_;
+            ham::util::time::timer t1;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
+            msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << msg_put.min().count() << std::endl; )
 
+            ham::util::time::timer t2;
             //MPI_Win_flush(node, peers[node].msg_win);
+            flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << flush.min().count() << std::endl; )
 
+            ham::util::time::timer t3;
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
+            flag_put.add(t3);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << flag_put.min().count() <<std::endl; )
 
         }
     }

From 4202bb92ac5690cf621e4ec502a6460da57ae552 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 14:58:33 +0200
Subject: [PATCH 064/150] removed unneeded target-target window locks

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 3eeb2e6..852f34e 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -292,8 +292,16 @@ class communicator {
         for (node_t i = 0; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
             if (i != this_node_) {
                 MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_data_win);  // shared locks because all ranks lock on every target concurrently
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].msg_win);  // shared locks because all ranks lock on every target concurrently
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].flag_win);  // shared locks because all ranks lock on every target concurrently
+            }
+        }
+
+        if (this_node_ != host_node_) { // targets
+            MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[0].msg_win);
+            MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[0].flag_win);
+        } else { // host
+            for (node_t i = 0; i < nodes_; ++i) {
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].msg_win);
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].flag_win);
             }
         }
 

From 178a870a9657a88a41fbf9ed3bdbd36c367f9be2 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <deppisch@zib.de>
Date: Wed, 2 May 2018 15:43:46 +0200
Subject: [PATCH 065/150] commented Win_flush when putting msg back in

---
 include/ham/net/communicator_mpi_rma_dynamic.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 852f34e..493f3c7 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -429,7 +429,7 @@ class communicator {
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
             ham::util::time::timer t2;
-            //MPI_Win_flush(node, peers[node].msg_win);
+            MPI_Win_flush(node, peers[node].msg_win);
             flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << flush.min().count() << std::endl; )
@@ -455,7 +455,7 @@ class communicator {
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << msg_put.min().count() << std::endl; )
 
             ham::util::time::timer t2;
-            //MPI_Win_flush(node, peers[node].msg_win);
+            MPI_Win_flush(node, peers[node].msg_win);
             flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << flush.min().count() << std::endl; )

From 4ed3efe815f375aa21954aac3f6febac423024a1 Mon Sep 17 00:00:00 2001
From: bemdeppi <deppisch@zib.de>
Date: Mon, 9 Jul 2018 13:36:32 +0200
Subject: [PATCH 066/150] nonfunctional changes

---
 Jamroot                                       |  4 +-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 66 +++++++++----------
 tools/install_boost.sh                        |  4 +-
 3 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/Jamroot b/Jamroot
index f832e4e..16b4f72 100644
--- a/Jamroot
+++ b/Jamroot
@@ -32,13 +32,13 @@ project HAM
 	<include>$(INC)
 	<include>$(BOOST_PATH)/include
 	<variant>debug:<define>HAM_DEBUG_ON
-#	<variant>debug_mic:<define>HAM_DEBUG_ON 
+	<variant>debug_mic:<define>HAM_DEBUG_ON
 #	<toolset>intel:<cflags>"-static-intel"
 	<inlining>on # off, on, full
 	#<optimization>speed # off, speed, space
 	<cxxflags>"-hstd=c++11"
 	<threading>multi
-#	<link>static
+	<link>static
 	: default-build release
 #	: default-build debug release debug_mic release_mic
 	;
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 493f3c7..f3cee5a 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -17,7 +17,7 @@
 #include "ham/misc/types.hpp"
 #include "ham/util/debug.hpp"
 #include "ham/util/log.hpp"
-#include "ham/util/time.hpp"
+// #include "ham/util/time.hpp"
 #include "communicator.hpp"
 
 namespace ham {
@@ -416,55 +416,55 @@ class communicator {
         HAM_DEBUG( HAM_LOG << "communicator::send_msg(): remote buffer index = " << buffer_index << std::endl; )
 
         if (node != host_node_) { // to targets
-            ham::util::time::statistics msg_put(1,0);
-            ham::util::time::statistics flush(1,0);
-            ham::util::time::statistics flag_put(1,0);
+            // ham::util::time::statistics msg_put(1,0);
+            // ham::util::time::statistics flush(1,0);
+            // ham::util::time::statistics flag_put(1,0);
 
-            ham::util::time::timer t1;
+            // ham::util::time::timer t1;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
-            msg_put.add(t1);
+            // msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << msg_put.min().count() << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
 
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
-            ham::util::time::timer t2;
+            // ham::util::time::timer t2;
             MPI_Win_flush(node, peers[node].msg_win);
-            flush.add(t2);
+            // flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << flush.min().count() << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
 
             // write flag to target flags buffer
             // not sure on the size here?
-            ham::util::time::timer t3;
+            // ham::util::time::timer t3;
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
-            flag_put.add(t3);
+            // flag_put.add(t3);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << flag_put.min().count() <<std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
 
         } else { // to host, used by send_result
-            ham::util::time::statistics msg_put(1,0);
-            ham::util::time::statistics flush(1,0);
-            ham::util::time::statistics flag_put(1,0);
+            // ham::util::time::statistics msg_put(1,0);
+            // ham::util::time::statistics flush(1,0);
+            // ham::util::time::statistics flag_put(1,0);
 
             size_t offset = constants::MSG_BUFFERS * this_node_;
-            ham::util::time::timer t1;
+            // ham::util::time::timer t1;
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
-            msg_put.add(t1);
+            // msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << msg_put.min().count() << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
 
-            ham::util::time::timer t2;
+            // ham::util::time::timer t2;
             MPI_Win_flush(node, peers[node].msg_win);
-            flush.add(t2);
+            // flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << flush.min().count() << std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
 
-            ham::util::time::timer t3;
+            // ham::util::time::timer t3;
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
-            flag_put.add(t3);
+            // flag_put.add(t3);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << flag_put.min().count() <<std::endl; )
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
 
         }
     }
@@ -486,9 +486,9 @@ class communicator {
     // called by function below
     void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
     {
-        ham::util::time::statistics pre_poll(1,0);
-        ham::util::time::statistics poll(1,0);
-        ham::util::time::timer t1;
+        // ham::util::time::statistics pre_poll(1,0);
+        // ham::util::time::statistics poll(1,0);
+        // ham::util::time::timer t1;
         buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
 		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
@@ -503,13 +503,13 @@ class communicator {
         }
 
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
-        pre_poll.add(t1);
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << pre_poll.min().count() << std::endl; )
-        ham::util::time::timer t2;
+        // pre_poll.add(t1);
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << ""/*pre_poll.min().count()*/ << std::endl; )
+        // ham::util::time::timer t2;
         while (*local_flag == FLAG_FALSE); // poll on flag for completion
-        poll.add(t2);
+        // poll.add(t2);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << poll.min().count() << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << ""/*poll.min().count()*/ << std::endl; )
 
 
         if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
diff --git a/tools/install_boost.sh b/tools/install_boost.sh
index 9b91667..2a19297 100755
--- a/tools/install_boost.sh
+++ b/tools/install_boost.sh
@@ -36,7 +36,7 @@
 DOWNLOAD_PATH=$HOME/boost/
 INSTALL_PATH=$HOME/software
 NO_MIC=true # set to true, to disable building Boost for Xeon Phi
-BASHRC_FILE=$HOME/.bashrc # set to /dev/null to disable, or to any other file to manually merge the needed changes into your .bashrc 
+BASHRC_FILE=$HOME/dev/null # set to /dev/null to disable, or to any other file to manually merge the needed changes into your .bashrc 
 
 BOOST_BUILD_OPTIONS="-j8" # concurrent build with up to 8 commands
 BOOST_NAME=boost
@@ -76,7 +76,7 @@ cd tools/build
 echo "Building Boost.Build ..."
 ./bootstrap.sh > $BUILD_LOG_BB 2>&1
 echo "Installing Boost.Build ..."
-./b2 install --prefix=$BOOST_INSTALL_PATH >> $BUILD_LOG_BB 2>&1
+./b2 install --prefix=${BOOST_INSTALL_PATH} >> $BUILD_LOG_BB 2>&1
 PATH=$BOOST_INSTALL_PATH/bin:$PATH
 cd ../..
 

From 42d9303203057871c92a93dd06b44f0c1dfb1681 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Thu, 11 Oct 2018 21:51:21 +0200
Subject: [PATCH 067/150] lock-get-unlock protocol

---
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 51 ++++++++++++++-----
 1 file changed, 39 insertions(+), 12 deletions(-)

diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index f3cee5a..f73a559 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -3,8 +3,8 @@
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-#ifndef ham_net_communicator_mpi_hpp
-#define ham_net_communicator_mpi_hpp
+#ifndef ham_net_communicator_mpi_rma_dynamic_hpp
+#define ham_net_communicator_mpi_rma_dynamic_hpp
 
 #include <mpi.h>
 
@@ -265,7 +265,8 @@ class communicator {
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].flag_win_data, &(peers[i].rma_flag_win));
             }
         }
-/*
+
+/*      // no longer needed
         // initialise all windows for target -> host
         for (node_t i = 1; i < nodes_; ++i) {
             if (is_host()) {
@@ -287,7 +288,7 @@ class communicator {
             }
         }
 */
-		// get all locks to targets
+		// get all locks to targets for data
         // targets lock to other targets for copies
         for (node_t i = 0; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
             if (i != this_node_) {
@@ -295,6 +296,8 @@ class communicator {
             }
         }
 
+        /* // locking will be done when accessing remote memory
+        // locks for active message rma transfers
         if (this_node_ != host_node_) { // targets
             MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[0].msg_win);
             MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[0].flag_win);
@@ -304,7 +307,7 @@ class communicator {
                 MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].flag_win);
             }
         }
-
+        */
 
         HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
 /* pairwise COMM stuff
@@ -421,6 +424,7 @@ class communicator {
             // ham::util::time::statistics flag_put(1,0);
 
             // ham::util::time::timer t1;
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_win);
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
             // msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
@@ -428,19 +432,23 @@ class communicator {
 
 
             // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
+            // unlock includes flush, no need for it here
+            MPI_Win_unlock(node, peers[node].msg_win);
             // ham::util::time::timer t2;
-            MPI_Win_flush(node, peers[node].msg_win);
+            // MPI_Win_flush(node, peers[node].msg_win);
             // flush.add(t2);
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
+            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
 
             // write flag to target flags buffer
             // not sure on the size here?
             // ham::util::time::timer t3;
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].flag_win);
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
             // flag_put.add(t3);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
+            MPI_Win_unlock(node, peers[node].flag_win);
 
         } else { // to host, used by send_result
             // ham::util::time::statistics msg_put(1,0);
@@ -449,20 +457,25 @@ class communicator {
 
             size_t offset = constants::MSG_BUFFERS * this_node_;
             // ham::util::time::timer t1;
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_win);
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
             // msg_put.add(t1);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
+            MPI_Win_unlock(node, peers[node].msg_win);
 
             // ham::util::time::timer t2;
-            MPI_Win_flush(node, peers[node].msg_win);
+            // MPI_Win_flush(node, peers[node].msg_win);
             // flush.add(t2);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
 
             // ham::util::time::timer t3;
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].flag_win);
+
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
             // flag_put.add(t3);
+            MPI_Win_unlock(node, peers[node].flag_win);
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
 
@@ -493,24 +506,38 @@ class communicator {
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
 		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
 
-        volatile size_t* local_flag;
+        size_t received_flag;
 
+
+        /* not needed with get
         if (this_node_ == host_node_) {
-            size_t offset = constants::MSG_BUFFERS * node;
             local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + buffer_index]);
         } else {
             local_flag = reinterpret_cast<size_t*>(&peers[this_node_].flag_data.get()[buffer_index]);
         }
+        */
 
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
         // pre_poll.add(t1);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << ""/*pre_poll.min().count()*/ << std::endl; )
         // ham::util::time::timer t2;
-        while (*local_flag == FLAG_FALSE); // poll on flag for completion
+
+
+        // needed on host to access the memory belonging to the node from which to receive
+        size_t offset = (this_node_ == host_node_) ? constants::MSG_BUFFERS * node : 0;
+
+        while (received_flag == FLAG_FALSE) {
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].flag_win);
+            MPI_Get(&received_flag, sizeof(size_t), MPI_BYTE, this_node_, offset , sizeof(size_t), MPI_BYTE, peers[this_node_].flag_win)
+            MPI_Win_unlock(this_node_, peers[this_node_].flag_win);
+        } // poll on flag for completion
         // poll.add(t2);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << ""/*poll.min().count()*/ << std::endl; )
 
+        // make sure message window is updated locally too
+        MPI_Win_lock(MPI_LOCK_SHARED, this_node_, 0, peers[this_node_].msg_win);
+        MPI_Win_unlock(this_node_, peers[this_node_].msg_win);
 
         if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
             peers[node].next_flag = *local_flag;

From 7549843a237f8e8468002c89c146fc70d92082bd Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sun, 21 Oct 2018 00:10:00 +0200
Subject: [PATCH 068/150] test

---
 include/ham/misc/constants.hpp                |   1 +
 include/ham/net/communicator.hpp              |   8 +-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 237 +++++++-----------
 include/ham/offload/offload.hpp               |   2 +
 include/ham/offload/offload_msg.hpp           |   2 +
 5 files changed, 102 insertions(+), 148 deletions(-)

diff --git a/include/ham/misc/constants.hpp b/include/ham/misc/constants.hpp
index 28483a9..113fc8c 100644
--- a/include/ham/misc/constants.hpp
+++ b/include/ham/misc/constants.hpp
@@ -17,6 +17,7 @@ namespace constants {
 enum net {
 	MSG_SIZE = HAM_MESSAGE_SIZE,
 	MSG_BUFFERS = 256,
+	FLAG_SIZE = sizeof(size_t),
 };
 
 enum arch {
diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index c754f99..65683c6 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -37,7 +37,13 @@ namespace net {
 	{
 		char data[constants::MSG_SIZE];
 	};
-	
+
+	struct
+	msg_flag_buffer
+	{
+		// buffer for fully rma backend. windows will consist of several of those
+		char data[constants::MSG_SIZE + constants::FLAG_SIZE];
+	};
 	
 	node_t this_node();
 }
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index f73a559..08d7815 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -104,13 +104,13 @@ class communicator {
 
             HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
 			MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
-			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
-            if(uses_rma_)
-            {
-                MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_data_win);
-                // this is just a dummy return, there is no reply from the target for rma data transfers
-                // TODO, Daniel - design decision on what to return here
-                return static_cast<void*>(&communicator::instance().peers[communicator::this_node()].msg_data[local_buffer_index]);
+            // for async get from receive_data_async() this will block until get is completed
+            HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
+
+            if(uses_rma_)  {
+                // this will only be true for async rma data transfers
+                // there will be no result returned, so this won't poll on anything and return a dummy instead.
+                return nullptr;
             } else {
                 return communicator::instance().recv_msg(target_node, local_buffer_index);
             }
@@ -223,19 +223,27 @@ class communicator {
             }
         }*/
 
-        // initialise all windows
+        // initialise data windows
         for (node_t i = 0; i < nodes_; ++i) {
             // dynamic data window
             MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_data_win));
+        }
+
+        // initialise message windows
+        for (node_t i = 0; i < nodes_; ++i) { // loop through ranks
 
             if (i == this_node_) { // create local windows with allocated memory for targets, host creates one inbound set of windows for all targets
-                // allocate memory
-                if (this_node_ == host_node_) {
-                    // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS * num_nodes for host
-                    peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
-                    peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
-                    reset_flags(peers[this_node_].flag_data, constants::MSG_BUFFERS * nodes_);
-                    // fill resource pools
+
+                // allocate memory and create windows
+                if (this_node_ == host_node_) { // host creates one large window with subsets associated with different targets
+
+                    // (MSG_SIZE+FLAG_SIZE) * MSG_BUFFERS * num_nodes = bytes of memory allocated (sizes are implicit in msg_flag_buffer struct)
+                    peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+                    // peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+                    // set flags to FLAG_FALSE
+                    reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS * nodes_); // TODO: Daniel - this may be bad if buffer structs are not contiguos - check
+
+                    // fill resource pools for managing indices on the host
                     for (size_t j = 0; j < nodes_; ++j) {
                         for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
                             // target buffers
@@ -245,49 +253,37 @@ class communicator {
                         // allocate first next_request,
                         allocate_next_request(j);
                     }
-                } else {
-                    // MSG_SIZE/FLAG_SIZE * MSG_BUFFERS for targets
-                    peers[this_node_].msg_data = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
-                    peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
+                    // create window with memory
+                    MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_flag_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
+                    // MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+
+                } else { // targets create one window with the size of their msg "queue"
+                    // (MSG_SIZE+FLAG_SIZE) * MSG_BUFFERS = bytes of memory allocated (sizes are implicit in msg_flag_buffer struct)
+                    peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS, this_node_);
+                    // peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
+                    // set flags to FLAG_FALSE
                     reset_flags(peers[this_node_].flag_data, constants::MSG_BUFFERS);
+
+                    // create window with memory
+                    MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
+                    // MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
                 }
 
-                // create windows
-                MPI_Win_create((peers[this_node_].msg_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_win));
-                MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+                // debug msg
+                HAM_DEBUG( std::cout << "Rank: " << this_node_ << " in loop run " << i << " created REAL windows..." << std::endl; )
+
 
-            } else { //create remote windows without memory
-                void* dump;
-                MPI_Win_create(dump, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_win));
-                MPI_Win_create(dump, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
+            } else { // create remote windows without memory (join the collective call and retreive the window handle)
 
+                MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_flag_win));
+                // MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
+                // debug msg
+                HAM_DEBUG( std::cout << "Rank: " << this_node_ << " in loop run " << i << " creating EMPTY windows..." << std::endl; )
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].msg_win_data, &(peers[i].rma_msg_win));
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].flag_win_data, &(peers[i].rma_flag_win));
             }
         }
 
-/*      // no longer needed
-        // initialise all windows for target -> host
-        for (node_t i = 1; i < nodes_; ++i) {
-            if (is_host()) {
-                // create local wins with memory for all targets
-                // allocate memory
-
-
-                // create window
-                MPI_Win_create(memptr, SIZE, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].local_msg_win));
-                MPI_Win_create(memptr, SIZE, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].local_flag_win));
-
-            } else {
-                // create remote wins without memory for host
-                if (i == this_node_) {
-                    MPI_Win_create(memptr, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[host_node_].local_msg_win));
-                    MPI_Win_create(memptr, 0, MPI_BYTE, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[host_node_].local_msg_win));
-
-                }
-            }
-        }
-*/
 		// get all locks to targets for data
         // targets lock to other targets for copies
         for (node_t i = 0; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
@@ -296,6 +292,9 @@ class communicator {
             }
         }
 
+        // MPI_Barrier(MPI_COMM_WORLD);
+
+
         /* // locking will be done when accessing remote memory
         // locks for active message rma transfers
         if (this_node_ != host_node_) { // targets
@@ -309,33 +308,8 @@ class communicator {
         }
         */
 
-        HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
-/* pairwise COMM stuff
-       // both
-                // prepare global group to create pairwise groups
-                MPI_Comm_group(MPI_COMM_WORLD, &global_group);
-       // host
- 				// init comm to target from pairwise subgroups
- 				const int members[2] = {host_node_, i}; // NOTE: this implies new group rank is 0 for host, 1 for target
- 				MPI_Group pairwise_group;
- 				MPI_Group_incl(global_group, 2, members, &pairwise_group);
- 				MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[i].rma_comm));
- 				MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
-
- 				// init win to target
- 				MPI_Win_create_dynamic(MPI_INFO_NULL, peers[i].rma_comm, &(peers[i].rma_data_win));
-       // targets
- 			    // init comm to host from pairwise subgroup
- 			    const int members[2] = {host_node_, this_node_}; // NOTE: this implies new group rank = 0 for host, 1 for target
- 			    MPI_Group pairwise_group;
- 			    MPI_Group_incl(global_group, 2, members, &pairwise_group); // should match the corresponding subgroup on host for i = this_node_
- 			    MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[host_node_].rma_comm));
- 			    MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
-
- 			    // init win to host
- 			    MPI_Win_create_dynamic(MPI_INFO_NULL, peers[host_node_].rma_comm, &(peers[host_node_].rma_data_win));
- */
-
+        HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation completed" << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::communicator(): communicator initialization completed" << std::endl; )
 	}
 
 	~communicator()
@@ -344,20 +318,20 @@ class communicator {
 		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
 	}
 
-    // this is only used by the host
+    // this is only used by the host to manage remote msg buffers and local reply buffers and assign them to requests
     const request& allocate_next_request(node_t remote_node)
     {
-        // this allocates a host-managed index for the remote nodes msg and flag buffers
+        // this allocates a host-managed index for the remote nodes msg "queue"
         // so the host knows which buffers are available on the target
         const size_t remote_buffer_index = peers[remote_node].remote_buffer_pool.allocate();
-        // this allocates an index for the hosts large msg and flag buffers
-        // request is included in offload message, so target knows into which buffers answers must be written
+        // this allocates an index in the hosts "reply queue"
+        // request is included in offload message, so the target knows into which buffers replys must be written
         // when used, the index will need to be added to an offset determined by a targets rank to address the part of the buffer belonging to this target
         // NOTE: the actual host buffer is stored at the hosts peers[0], but the buffer_pools are stored at the corresponding peers[target]
         // buffer_pools manage idices within the targets section of the hosts buffer
         const size_t local_buffer_index = peers[remote_node].local_buffer_pool.allocate();
 
-        peers[remote_node].next_request = { remote_node, this_node_, remote_buffer_index, local_buffer_index};
+        peers[remote_node].next_request = {remote_node, this_node_, remote_buffer_index, local_buffer_index};
 
         return peers[remote_node].next_request;
     }
@@ -370,7 +344,8 @@ class communicator {
 		return peers[remote_node].next_request;
 	}
 
-    // used for async rma data transfers, so they wont take up buffer indices they dont need
+    // used for rma data transfers, so they wont take up unneeded buffer indices
+    // only put() and get() use this, copy() offloads an active msg to the data source and therefore uses allocate_request()
     request allocate_data_request(node_t remote_node) {
         HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
         return { remote_node, this_node_, NO_BUFFER_INDEX, NO_BUFFER_INDEX };
@@ -390,12 +365,13 @@ class communicator {
         mpi_peer& peer = peers[req.target_node];
 
 
-        // set flags to false
-        // local flag inside large host flag buffer @ peers[host]
+        // set flag for buffer indices associated with request to false
+        // local flag is inside the hosts large array of msg_flag_buffers @ peers[host]
         // index offset computed using target node
-        size_t offset = constants::MSG_BUFFERS * req.target_node;
-        volatile size_t* local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + req.local_buffer_index]);
-        *local_flag= FLAG_FALSE;
+        // TODO: Daniel - figure out access to flag memory
+        size_t offset = constants::MSG_BUFFERS * req.target_node; // offset msg_flag_buffers to the corresponding nodes region
+        volatile size_t* local_flag = reinterpret_cast<size_t*>(&peers[host_node_].msg_flag_data.get()[offset + req.local_buffer_index]); // this will point to the beginning of a msg_flag_buffer
+        *local_flag = FLAG_FALSE;
         // remote flag on target
         /* This is done by the target after having reveived the new index to poll on
         size_t remote_flag = FLAG_FALSE;
@@ -455,7 +431,9 @@ class communicator {
             // ham::util::time::statistics flush(1,0);
             // ham::util::time::statistics flag_put(1,0);
 
+            // compute offset in the hosts window
             size_t offset = constants::MSG_BUFFERS * this_node_;
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using msg host-offset (bytes): " << offset*sizeof(msg_buffer) << std::endl; )
             // ham::util::time::timer t1;
             MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_win);
             MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
@@ -471,8 +449,8 @@ class communicator {
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
 
             // ham::util::time::timer t3;
+            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using flag host-offset (bytes): " << offset*sizeof(cache_line_buffer) << std::endl; )
             MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].flag_win);
-
             MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
             // flag_put.add(t3);
             MPI_Win_unlock(node, peers[node].flag_win);
@@ -506,43 +484,50 @@ class communicator {
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
 		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
 
-        size_t received_flag;
 
+        size_t *local_flag;
+        size_t received_flag = FLAG_FALSE;
+
+		// needed on host to access the memory belonging to the node from which to receive
+		size_t offset = (this_node_ == host_node_) ? constants::MSG_BUFFERS * node : 0;
 
-        /* not needed with get
         if (this_node_ == host_node_) {
             local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + buffer_index]);
         } else {
             local_flag = reinterpret_cast<size_t*>(&peers[this_node_].flag_data.get()[buffer_index]);
         }
-        */
+
 
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
         // pre_poll.add(t1);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << ""/*pre_poll.min().count()*/ << std::endl; )
         // ham::util::time::timer t2;
+		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling at offset (bytes): " << offset * sizeof(cache_line_buffer) << ""/*pre_poll.min().count()*/ << std::endl; )
 
 
-        // needed on host to access the memory belonging to the node from which to receive
-        size_t offset = (this_node_ == host_node_) ? constants::MSG_BUFFERS * node : 0;
 
         while (received_flag == FLAG_FALSE) {
             MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].flag_win);
-            MPI_Get(&received_flag, sizeof(size_t), MPI_BYTE, this_node_, offset , sizeof(size_t), MPI_BYTE, peers[this_node_].flag_win)
+            MPI_Get(&received_flag, sizeof(cache_line_buffer), MPI_BYTE, this_node_, (offset + buffer_index) * sizeof(cache_line_buffer) , sizeof(cache_line_buffer), MPI_BYTE, peers[this_node_].flag_win);
             MPI_Win_unlock(this_node_, peers[this_node_].flag_win);
         } // poll on flag for completion
         // poll.add(t2);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << ""/*poll.min().count()*/ << std::endl; )
 
+        // reset the flag (thanks mpi for requiring me to get a lock for that again...
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].flag_win);
+        *local_flag = FLAG_FALSE;
+        MPI_Win_unlock(this_node_, peers[this_node_].flag_win);
+
         // make sure message window is updated locally too
         MPI_Win_lock(MPI_LOCK_SHARED, this_node_, 0, peers[this_node_].msg_win);
         MPI_Win_unlock(this_node_, peers[this_node_].msg_win);
 
-        if (*local_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
-            peers[node].next_flag = *local_flag;
+        if (received_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
+            peers[node].next_flag = received_flag;
 
-        *local_flag = FLAG_FALSE;
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): done " << ""/*poll.min().count()*/ << std::endl; )
 
         if (this_node_ == host_node_) {
             size_t offset = constants::MSG_BUFFERS * node;
@@ -566,14 +551,16 @@ class communicator {
 	void recv_result(request_reference_type req)
 	{
 		// nothing todo here, since this communicator implementation uses one-sided communication
-		// the data is already where it is expected (in the buffer referenced in req)
+		// the data will be written to where it is expected
 
         // MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
 		return;
 	}
 
+    // only used by the host
     bool test_local_flag(node_t node, size_t buffer_index)
     {
+        size_t offset = (constants::MSG_SIZE + constants::FLAG_SIZE) * node;
         volatile size_t * local_flag = reinterpret_cast<size_t*>(&peers[node].flag_data.get()[buffer_index]);
         return *local_flag != FLAG_FALSE;
     }
@@ -597,10 +584,8 @@ class communicator {
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		// execute transfer
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_data_win); // not needed since all ranks have locks on all targets
         MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win);
         MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
-		// MPI_Win_unlock(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
 	}
 
 	// to be used by the host only
@@ -609,7 +594,6 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-        // MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_data_win);
         MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win, &req.next_mpi_request());
 	}
 
@@ -620,10 +604,8 @@ class communicator {
 	template<typename T>
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_data_win);
 		MPI_Get(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win);
 		MPI_Win_flush(remote_source.node(), peers[remote_source.node()].rma_data_win);
-		// MPI_Win_unlock(remote_source.node(), peers[remote_source.node()].rma_data_win);
 	}
 	
 	// to be used by the host
@@ -632,7 +614,6 @@ class communicator {
 	{
         req.uses_rma_ = true;
 
-		// MPI_Win_lock(MPI_LOCK_SHARED, remote_source.node(), 0, peers[remote_source.node()].rma_data_win);
 		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win, &req.next_mpi_request());
 	}
 
@@ -642,12 +623,11 @@ class communicator {
 		T* ptr;
 		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+
         // attach to own window
         HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
-        /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
-            MPI_Win_attach(peers[i].rma_data_win, (void*)ptr, n * sizeof(T));
-        } */
+
 		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
 		// NOTE: no ctor is called
@@ -658,13 +638,10 @@ class communicator {
 	template<typename T>
 	buffer_ptr<T> allocate_peer_buffer(const size_t n, node_t source_node)
 	{
-        // TODO DANIEL: this is where mem is allocated that should be mapped to static mpi windows
         T* ptr;
-		//int err =
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
 		// NOTE: no ctor is called
 		return buffer_ptr<T>(ptr, this_node_);
-
 	}
 
     // used for data buffers only
@@ -704,36 +681,6 @@ class communicator {
 		return instance().node_descriptions[node];
 	}
 
-/*
-	// called to check if an rma path between two targets exists, sufficient to call on one of the two targets
-	bool has_rma_path(node_t target_node) {
-		// check if copy path exists
-		return !peers[remote_dest.node()].rma_data_win;
-	}
-*/
-/*
-	// called to establish an rma path between two targets for copy operations, needs to be called on both sides
-	void establish_rma_path(node_t target_node) {
-		if(!has_rma_path(target_node)) { // make sure there is not already an rma path
-			const int members[2];
-			// NOTE: protocol for target-target sub-ranks is: lower global rank: 0, higher global rank: 1
-			// thus rank for existing copy paths can be easily translated by comparing target rank to own rank
-			if(this_node_ > target_node) {
-				members[0] = target_node;
-				members[1] = this_node_;
-			} else {
-				members[0] = this_node_;
-				members[1] = target_node;
-			}
-			MPI_Group pairwise_group;
-			MPI_Group_incl(global_group, 2, members, &pairwise_group);
-			MPI_Comm_create_group(MPI_COMM_WORLD, pairwise_group, 0, &(peers[target_node].rma_comm));
-			MPI_Group_free(&pairwise_group); // no longer needed after COMM is created
-			MPI_Win_create_dynamic(MPI_INFO_NULL, peers[target_node].rma_comm, &(peers[target_node].rma_data_win));
-		}
-	}
-*/
-
 private:
 	static communicator* instance_;
 	node_t this_node_;
@@ -751,15 +698,11 @@ class communicator {
 
         request next_request;
         size_t next_flag = 0;
+        // NOTE: behind these buffers are MSG_BUFFERS many buffers of size MSG_SIZE+FLAG_SIZE, indices are managed by buffer_pool
 
-        // NOTE: behind these buffers are MSG_BUFFERS many buffers of size MSG_SIZE/CACHE_LINE_SIZE, indices are managed by buffer_pool
-
-        // static window for inbound rma messages
-        buffer_ptr<msg_buffer> msg_data;
-        MPI_Win msg_win;
-        // static window for inbound message flags
-        buffer_ptr<cache_line_buffer> flag_data;
-        MPI_Win flag_win;
+        // static window for inbound rma messages and their flags
+        buffer_ptr<msg_flag_buffer> msg_flag_data;
+        MPI_Win msg_flag_win;
 
 		// mpi rma dynamic window for data
 		MPI_Win rma_data_win;
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index a315c50..0148e7f 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -270,6 +270,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	comm.send_msg(result.get_request(), (void*)&msg, sizeof msg);
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
 	comm.recv_result(result.get_request()); // trigger receiving the result
+	// TODO(improvement): the recv_result() is not needed, could remove and remove send_result() from offload_read_msg to reduce synchronization overhead
 
 	return result;
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
@@ -355,6 +356,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	comm.recv_result(write_result.get_request()); // trigger receiving the msg result // async
 	
 	// synchronise
+	// TODO(improvement): this is oversynchronized, waiting for the target to complete receiving should be sufficient
 	read_result.get();
 	write_result.get();
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index 97c5e95..6e709a4 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -95,6 +95,7 @@ class offload_write_msg
 		communicator::instance().recv_data(buffer_ptr<T>(nullptr, remote_node), local_dest, n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a send operation that has the address.
 
 		// send a result to tell the sender, that the transfer is done
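+        // TODO(improvement): this may be removable (like the result sent by offload_read_msg) if the sender stops waiting for it -- to be confirmed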
 		if (req.valid()) {
 			req.send_result((void*)&n, sizeof n);
 		}
@@ -122,6 +123,7 @@ class offload_read_msg
 		communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node), n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
 		
 		// send a result message to tell the sender, that the transfer is done
+        // TODO(improvement): this may be removed along with receiving the result in offload get()
 		if (req.valid()) {
 			req.send_result((void*)&n, sizeof n);
 		}

From 1a907970c7d221868f45c2556a570db45f400d22 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sun, 28 Oct 2018 00:18:10 +0200
Subject: [PATCH 069/150] functional version, lots of changes

---
 include/ham/misc/constants.hpp                |   3 +-
 include/ham/misc/types.hpp                    |   1 +
 include/ham/net/communicator.hpp              |   3 +-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 334 ++++++++----------
 4 files changed, 161 insertions(+), 180 deletions(-)

diff --git a/include/ham/misc/constants.hpp b/include/ham/misc/constants.hpp
index 113fc8c..b37c690 100644
--- a/include/ham/misc/constants.hpp
+++ b/include/ham/misc/constants.hpp
@@ -17,7 +17,8 @@ namespace constants {
 enum net {
 	MSG_SIZE = HAM_MESSAGE_SIZE,
 	MSG_BUFFERS = 256,
-	FLAG_SIZE = sizeof(size_t),
+	DATA_PUT_CODE = 1,
+	DATA_GET_CODE = 2,
 };
 
 enum arch {
diff --git a/include/ham/misc/types.hpp b/include/ham/misc/types.hpp
index 458eefc..1b8393d 100644
--- a/include/ham/misc/types.hpp
+++ b/include/ham/misc/types.hpp
@@ -12,6 +12,7 @@
 namespace ham {
 
 typedef size_t node_t; // node type, e.g. MPI rank, identifies remote target process
+typedef size_t flag_t; // MPI RMA completion flag / buffer index
 typedef char*  msg_buffer_t; // buffer type for messages
 
 namespace net {
diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index 65683c6..a0a6164 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -42,7 +42,8 @@ namespace net {
 	msg_flag_buffer
 	{
 		// buffer for fully rma backend. windows will consist of several of those
-		char data[constants::MSG_SIZE + constants::FLAG_SIZE];
+		char msg[constants::MSG_SIZE];
+		flag_t flag;
 	};
 	
 	node_t this_node();
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 08d7815..4afd7f5 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -77,40 +77,39 @@ class communicator {
 		request() : valid_(false) {} // instantiate invalid
 		
 		request(node_t target_node, node_t source_node, size_t remote_buffer_index, size_t local_buffer_index)
-		 : target_node(target_node), source_node(source_node), valid_(true), remote_buffer_index(remote_buffer_index), local_buffer_index(local_buffer_index), req_count(0), uses_rma_(false)
+		 : target_node(target_node), source_node(source_node), valid_(true), remote_buffer_index(remote_buffer_index), local_buffer_index(local_buffer_index), req_count(0), data_transfer_type(0)
 		{}
 
 		// return true if request was finished
         // will not work as intended for rma ops, no equivalent to test() available for remote completion
 		bool test()
 		{
-			// int flag = 0;
-
-            // MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // just test the receive request, since the send belonging to the request triggers the remote send that is received
-
-            /*
-            if(uses_rma_)
-            {
-                HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
+            if(data_transfer_type) { // this will be true for rma data transfers
+                int flag = 0;
+                MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // test on RGET is what we want, because local completion = full completion for get, but for RPut local is not enough and there is no non-blocking remote-completion test
+                HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma put remote completion" << std::endl; )
+                // TODO - Daniel: this is bad but MPI RMA doesn't have anything better
+                // TODO - Daniel: discuss preliminary design decision with Matthias: false positive + longer block = better than false negative as users may poll on this and get stuck
+                return flag != 0;
             }
-
-            return flag != 0;
-            */
             return communicator::instance().test_local_flag(target_node, local_buffer_index);
 		}
 
 		void* get() // blocks
 		{
-
-            HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
-			MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
-            // for async get from receive_data_async() this will block until get is completed
-            HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
-
-            if(uses_rma_)  {
-                // this will only be true for async rma data transfers
+            if(data_transfer_type)  {
+                HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+                MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // Get will have fully completed
+                HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
+                if(data_transfer_type == constants::DATA_PUT_CODE) {
+                    HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Win_flush()" << std::endl; )
+                    communicator::instance().flush_data(target_node);
+                    HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Win_flush()" << std::endl; )
+                }
+                // this branch is only taken for async rma data transfers (Rput/Rget issued by send_data_async()/recv_data_async())
                 // there will be no result returned, so this won't poll on anything and return a dummy instead.
                 return nullptr;
+                // TODO - Daniel: this is bad but MPI RMA doesn't have anything better
             } else {
                 return communicator::instance().recv_msg(target_node, local_buffer_index);
             }
@@ -131,9 +130,9 @@ class communicator {
 			return valid_;
 		}
 
-        bool uses_rma() const
+        bool is_rma_data_transfer() const
         {
-            return uses_rma_;
+            return data_transfer_type;
         }
 
 		MPI_Request& next_mpi_request()
@@ -146,7 +145,7 @@ class communicator {
 		node_t target_node;
 		node_t source_node;
 		bool valid_;
-        bool uses_rma_;
+        short data_transfer_type;
 
 		// only needed by the sender
 		enum { NUM_REQUESTS = 3 };
@@ -241,7 +240,7 @@ class communicator {
                     peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
                     // peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
                     // set flags to FLAG_FALSE
-                    reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS * nodes_); // TODO: Daniel - this may be bad if buffer structs are not contiguos - check
+                    reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS * nodes_); // structs are contiguos, this is ok
 
                     // fill resource pools for managing indices on the host
                     for (size_t j = 0; j < nodes_; ++j) {
@@ -262,7 +261,7 @@ class communicator {
                     peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS, this_node_);
                     // peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
                     // set flags to FLAG_FALSE
-                    reset_flags(peers[this_node_].flag_data, constants::MSG_BUFFERS);
+                    reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS);
 
                     // create window with memory
                     MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
@@ -365,15 +364,16 @@ class communicator {
         mpi_peer& peer = peers[req.target_node];
 
 
-        // set flag for buffer indices associated with request to false
+        // reset local flag
         // local flag is inside the hosts large array of msg_flag_buffers @ peers[host]
         // index offset computed using target node
-        // TODO: Daniel - figure out access to flag memory
+        // as this is an access to rma window memory, we need to lock again...
+        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
         size_t offset = constants::MSG_BUFFERS * req.target_node; // offset msg_flag_buffers to the corresponding nodes region
-        volatile size_t* local_flag = reinterpret_cast<size_t*>(&peers[host_node_].msg_flag_data.get()[offset + req.local_buffer_index]); // this will point to the beginning of a msg_flag_buffer
-        *local_flag = FLAG_FALSE;
+        peers[this_node_].msg_flag_data.get()[offset + req.local_buffer_index].flag = FLAG_FALSE;
+        MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
         // remote flag on target
-        /* This is done by the target after having reveived the new index to poll on
+        /* This is done by the target after having received the new index to poll on
         size_t remote_flag = FLAG_FALSE;
         MPI_Put(&remote_flag, sizeof(remote_flag), MPI_BYTE, req.target_node, 0, sizeof(remote_flag), MPI_BYTE, peer.flag_win);
         // flush? don't think so
@@ -400,31 +400,30 @@ class communicator {
             // ham::util::time::statistics flag_put(1,0);
 
             // ham::util::time::timer t1;
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_win);
-            MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_win);
-            // msg_put.add(t1);
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
-
-
-            // TODO DANIEL: because MPI does not guarantee order on RMA ops, there might be a FLUSH necessary here
-            // unlock includes flush, no need for it here
-            MPI_Win_unlock(node, peers[node].msg_win);
-            // ham::util::time::timer t2;
-            // MPI_Win_flush(node, peers[node].msg_win);
-            // flush.add(t2);
-            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
-
-            // write flag to target flags buffer
-            // not sure on the size here?
-            // ham::util::time::timer t3;
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].flag_win);
-            MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * buffer_index, sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
+            // put msg
+			MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_flag_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_flag_win);
+			// put flag
+			MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(msg_flag_buffer) * buffer_index + constants::MSG_SIZE, sizeof(next_buffer_index), MPI_BYTE, peers[node].msg_flag_win);
+			// msg_put.add(t1);
+			MPI_Win_unlock(node, peers[node].msg_flag_win);
+			HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg + flag" << std::endl; )
+
+			// unlock includes flush, no need for it here
+			// ham::util::time::timer t2;
+			// MPI_Win_flush(node, peers[node].msg_win);
+			// flush.add(t2);
+			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
+
+			// write flag to target flags buffer
+			// not sure on the size here?
+			// ham::util::time::timer t3;
+			// MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
             // flag_put.add(t3);
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
-            MPI_Win_unlock(node, peers[node].flag_win);
+            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
+            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
+            // MPI_Win_unlock(node, peers[node].msg_flag_win);
 
         } else { // to host, used by send_result
             // ham::util::time::statistics msg_put(1,0);
@@ -435,106 +434,82 @@ class communicator {
             size_t offset = constants::MSG_BUFFERS * this_node_;
             HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using msg host-offset (bytes): " << offset*sizeof(msg_buffer) << std::endl; )
             // ham::util::time::timer t1;
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_win);
-            MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_win);
-            // msg_put.add(t1);
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
-            MPI_Win_unlock(node, peers[node].msg_win);
-
-            // ham::util::time::timer t2;
-            // MPI_Win_flush(node, peers[node].msg_win);
-            // flush.add(t2);
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
-
-            // ham::util::time::timer t3;
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using flag host-offset (bytes): " << offset*sizeof(cache_line_buffer) << std::endl; )
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].flag_win);
-            MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(cache_line_buffer) * (offset + buffer_index), sizeof(next_buffer_index), MPI_BYTE, peers[node].flag_win);
+            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
+            // put msg/result
+			MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_flag_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_flag_win);
+            // put flag/result notification
+			MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(msg_flag_buffer) * (offset + buffer_index)  + constants::MSG_SIZE, sizeof(next_buffer_index), MPI_BYTE, peers[node].msg_flag_win);
+			MPI_Win_unlock(node, peers[node].msg_flag_win);
+			HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg + flag" << std::endl; )
+			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
+
+			// ham::util::time::timer t2;
+			// MPI_Win_flush(node, peers[node].msg_win);
+			// flush.add(t2);
+			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
+			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
+
+			// ham::util::time::timer t3;
+			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using flag host-offset (bytes): " << offset*sizeof(cache_line_buffer) << std::endl; )
+			// MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
             // flag_put.add(t3);
-            MPI_Win_unlock(node, peers[node].flag_win);
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
+            // MPI_Win_unlock(node, peers[node].msg_flag_win);
+            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
+            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
 
         }
     }
-
-	void send_msg(request_reference_type req, void* msg, size_t size)
-	{
-		/*
-        // copy message from caller into transfer buffer
-		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
-		memcpy(msg_buffer, msg, size);
-		MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
-	    */
-
-        const request& next_req = allocate_next_request(req.target_node); // allocate_next_req needed??
+    // this is used by the host
+	void send_msg(request_reference_type req, void* msg, size_t size) {
+        const request& next_req = allocate_next_request(req.target_node); // this is only required for the host
         send_msg(req.target_node, req.remote_buffer_index, next_req.remote_buffer_index, msg, size);
     }
 
     // make private?!
     // called by function below
-    void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE)
-    {
-        // ham::util::time::statistics pre_poll(1,0);
-        // ham::util::time::statistics poll(1,0);
-        // ham::util::time::timer t1;
-        buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
-		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
+    void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE) {
+		buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
+		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
+		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
 
 
-        size_t *local_flag;
-        size_t received_flag = FLAG_FALSE;
+		// size_t *local_flag;
+		flag_t received_flag = FLAG_FALSE;
 
 		// needed on host to access the memory belonging to the node from which to receive
 		size_t offset = (this_node_ == host_node_) ? constants::MSG_BUFFERS * node : 0;
 
-        if (this_node_ == host_node_) {
-            local_flag = reinterpret_cast<size_t*>(&peers[host_node_].flag_data.get()[offset + buffer_index]);
-        } else {
-            local_flag = reinterpret_cast<size_t*>(&peers[this_node_].flag_data.get()[buffer_index]);
-        }
-
-
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << (int)*local_flag << std::endl; )
-        // pre_poll.add(t1);
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): pre-polling took: " << ""/*pre_poll.min().count()*/ << std::endl; )
-        // ham::util::time::timer t2;
-		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling at offset (bytes): " << offset * sizeof(cache_line_buffer) << ""/*pre_poll.min().count()*/ << std::endl; )
-
-
-
-        while (received_flag == FLAG_FALSE) {
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].flag_win);
-            MPI_Get(&received_flag, sizeof(cache_line_buffer), MPI_BYTE, this_node_, (offset + buffer_index) * sizeof(cache_line_buffer) , sizeof(cache_line_buffer), MPI_BYTE, peers[this_node_].flag_win);
-            MPI_Win_unlock(this_node_, peers[this_node_].flag_win);
-        } // poll on flag for completion
-        // poll.add(t2);
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): FLAG after polling: " << (int)*local_flag << std::endl; )
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): polling took: " << ""/*poll.min().count()*/ << std::endl; )
-
-        // reset the flag (thanks mpi for requiring me to get a lock for that again...
-        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].flag_win);
-        *local_flag = FLAG_FALSE;
-        MPI_Win_unlock(this_node_, peers[this_node_].flag_win);
-
-        // make sure message window is updated locally too
-        MPI_Win_lock(MPI_LOCK_SHARED, this_node_, 0, peers[this_node_].msg_win);
-        MPI_Win_unlock(this_node_, peers[this_node_].msg_win);
-
-        if (received_flag != NO_BUFFER_INDEX) // the flag contains the next buffer index to poll on
-            peers[node].next_flag = received_flag;
+		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): FLAG before polling: " << peers[this_node_].msg_flag_data.get()[offset + buffer_index].flag << std::endl; )
+		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): polling at offset (bytes): "
+						  << (offset + buffer_index) * sizeof(msg_flag_buffer) << std::endl; )
+
+		while (received_flag == FLAG_FALSE) {
+			MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
+			MPI_Get(&received_flag, sizeof(flag_t), MPI_BYTE, this_node_,
+					sizeof(msg_flag_buffer) * (offset + buffer_index) + constants::MSG_SIZE, sizeof(flag_t),
+					MPI_BYTE, peers[this_node_].msg_flag_win);
+			// using a get here, by standard just accessing the memory should be okay too, like below
+			// received_flag = peers[this_node_].msg_flag_data.get()[offset + buffer_index].flag);
+			MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
+		} // poll on flag for completion
+		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): FLAG after polling: "
+						  << peers[this_node_].msg_flag_data.get()[offset + buffer_index].flag << std::endl; )
+
+		// reset the flag
+		// this is weird: theoretically this reset does not need to be visible publicly, as no other process ever reads the flags -> could leave out the locking
+		// however, if in the separate model the local wincopy has an update that is not updated to the public copy, behaviour when the target locks again for polling is undefined
+		// might overwrite local change with old flag still valid in public window (which will undo resetting the flag and result in infinite re-execution of the last AM until host writes new flag to public win)
+		MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
+		peers[this_node_].msg_flag_data.get()[offset + buffer_index].flag = FLAG_FALSE; // offset==0 for non-hosts
+		MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
+
+		if (received_flag != NO_BUFFER_INDEX) { // the flag contains the next buffer index to poll on
+			peers[node].next_flag = received_flag;
+		}
 
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): done " << ""/*poll.min().count()*/ << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): done " << std::endl; )
 
-        if (this_node_ == host_node_) {
-            size_t offset = constants::MSG_BUFFERS * node;
-            return &peers[host_node_].msg_data.get()[offset + buffer_index];
-        } else {
-            return &peers[this_node_].msg_data.get()[buffer_index];
-        }
+        return &peers[this_node_].msg_flag_data.get()[offset + buffer_index]; // offset==0 for non-hosts
     }
 
 	// to be used by the offload target's main loop: synchronously receive one message at a time
@@ -544,45 +519,48 @@ class communicator {
 		/* static msg_buffer buffer; // NOTE !
 		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         return static_cast<void*>(&buffer); */
-        return recv_msg(host_node_, NO_BUFFER_INDEX, msg, size);
+        return static_cast<void*>(recv_msg(host_node_, NO_BUFFER_INDEX, msg, size));
 	}
 
-	// trigger receiving the result of a message on the sending side
+	// trigger asynchronously receiving the result of a message on the sending side
 	void recv_result(request_reference_type req)
 	{
-		// nothing todo here, since this communicator implementation uses one-sided communication
+		// nothing to do here, since this communicator implementation uses one-sided communication
 		// the data will be written to where it is expected
-
-        // MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
-		return;
+		HAM_DEBUG( HAM_LOG << "communicator::recv_result(): This does nothing with the MPI RMA communicator" << std::endl; )
+        return;
 	}
 
-    // only used by the host
-    bool test_local_flag(node_t node, size_t buffer_index)
-    {
-        size_t offset = (constants::MSG_SIZE + constants::FLAG_SIZE) * node;
-        volatile size_t * local_flag = reinterpret_cast<size_t*>(&peers[node].flag_data.get()[buffer_index]);
-        return *local_flag != FLAG_FALSE;
+    // Host-only, non-blocking: used by request::test() (called by future.test() in offload.hpp) to check whether a flag was written.
+    bool test_local_flag(node_t node, size_t buffer_index) {
+		size_t offset = constants::MSG_BUFFERS * node; // start of the region belonging to `node` inside this process' flag memory
+		flag_t temp_flag = FLAG_FALSE;
+		// public window flag changes may not have been propagated to the local window copy yet, so read inside a lock epoch
+		MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
+		temp_flag = peers[this_node_].msg_flag_data.get()[offset + buffer_index].flag; // same buffer recv_msg() polls and resets
+		MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
+        return temp_flag != FLAG_FALSE;
     }
 
-    void reset_flags(buffer_ptr<cache_line_buffer> flags, size_t size)
-	{
-		cache_line_buffer fill_value;
-		cache_line_buffer* fill_value_ptr = &fill_value;
-		// null fill_value
-		std::fill(reinterpret_cast<unsigned char*>(fill_value_ptr), reinterpret_cast<unsigned char*>(fill_value_ptr) + sizeof(cache_line_buffer), 0);
-		// set to flag false
-		*reinterpret_cast<size_t*>(fill_value_ptr) = FLAG_FALSE;
-		// set all flags to fill_value
-		std::fill(flags.get(), flags.get() + size, fill_value);
+    void flush_data(node_t node) {
+        MPI_Win_flush(node, peers[node].rma_data_win);
+    }
+
+	// Sets the flag of the first `size` msg_flag_buffers to FLAG_FALSE; only called @ communicator construction.
+	// Calling this at any other point may reset flags belonging to messages that have not yet been executed (and never will be then).
+    void reset_flags(buffer_ptr<msg_flag_buffer> msg_flags, size_t size) {
+		// flags are interleaved with the message payloads (array of structs), so each flag is written individually
+		// iterate over [0, size): callers pass the allocated element count, so index `size` would be out of bounds
+		for (size_t i = 0; i < size; ++i) {
+			msg_flags.get()[i].flag = FLAG_FALSE;
+		}
 	}
 
 	// in MPI RMA backend only used by copy
 	// host uses async version
 	// targets don't send data to host as host uses rma get
 	template<typename T>
-	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
-	{
+	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size) {
 		// execute transfer
         MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win);
         MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
@@ -590,10 +568,8 @@ class communicator {
 
 	// to be used by the host only
 	template<typename T>
-	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
-	{
-        req.uses_rma_ = true;
-
+	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size) {
+        req.data_transfer_type = constants::DATA_PUT_CODE;
         MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win, &req.next_mpi_request());
 	}
 
@@ -610,20 +586,18 @@ class communicator {
 	
 	// to be used by the host
 	template<typename T>
-	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
-	{
-        req.uses_rma_ = true;
-
-		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win, &req.next_mpi_request());
+	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size) {
+        req.data_transfer_type = constants::DATA_GET_CODE;
+        MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win, &req.next_mpi_request());
 	}
 
 	template<typename T>
 	buffer_ptr<T> allocate_buffer(const size_t n, node_t source_node)
 	{
 		T* ptr;
-		//int err =
-		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
 
+		// posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+		MPI_Alloc_mem(n * sizeof(T), MPI_INFO_NULL, &ptr);
         // attach to own window
         HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << std::endl; )
         MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
@@ -639,7 +613,9 @@ class communicator {
 	buffer_ptr<T> allocate_peer_buffer(const size_t n, node_t source_node)
 	{
         T* ptr;
-		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+		// posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T)); // if you revert to memalign, also change back free in free_peer_buffer()
+		// using MPI_Alloc instead as these buffers are used for RMA accesses
+		MPI_Alloc_mem(n * sizeof(T), MPI_INFO_NULL, &ptr);
 		// NOTE: no ctor is called
 		return buffer_ptr<T>(ptr, this_node_);
 	}
@@ -656,18 +632,19 @@ class communicator {
         /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
             MPI_Win_detach(peers[i].rma_data_win, ptr.get());
         } */
-		free(static_cast<void*>(ptr.get()));
+		// free(static_cast<void*>(ptr.get())); // switch back to this if you revert back from using MPI_alloc_mem()
+		MPI_Free_mem(ptr.get());
 	}
 
     // for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
 	template<typename T>
 	void free_peer_buffer(buffer_ptr<T> ptr)
 	{
-        // TODO DANIEL: this is where mem is freed that should be mapped to static mpi windows
-        // i dont think this is ever called on the actual memory mapped to static mpi windows, freeing it would equal "disconnecting" corresponding target
+        // this will never be called on the actual memory mapped to static mpi windows, freeing it would equal "disconnecting" the corresponding target
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
-		free(static_cast<void*>(ptr.get()));
+		// free(static_cast<void*>(ptr.get())); // switch back to this if you revert back from using MPI_alloc_mem()
+		MPI_Free_mem(ptr.get());
 	}
 
 	static communicator& instance() { return *instance_; }
@@ -689,7 +666,8 @@ class communicator {
 	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
 
     struct mpi_peer {
-		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender // buffers used for MPI_RPut and RGet
+
+		// buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender // not needed for RMA version, host-side RMA window is used instead
 
 		// needed by sender to manage which buffers are in use and which are free
 		// just manages indices, that can be used by

From a7ad374b8cabd0a3a831e64ca91e9e9dc34ae7d4 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sun, 28 Oct 2018 16:59:10 +0100
Subject: [PATCH 070/150] added CMake

---
 CMakeLists.txt                                |  48 ++
 src/CMakeLists.txt                            |  83 +++
 src/ham/CMakeLists.txt                        |  70 +++
 thirdparty/bmt/AUTHORS.md                     |   4 +
 thirdparty/bmt/CMakeLists.txt                 |  26 +
 thirdparty/bmt/LICENSE_1_0.txt                |  23 +
 thirdparty/bmt/README.md                      |  21 +
 thirdparty/bmt/build/CMakeCache.txt           | 278 +++++++++
 .../CMakeFiles/3.5.2/CMakeCXXCompiler.cmake   |  68 +++
 .../3.5.2/CMakeDetermineCompilerABI_CXX.bin   | Bin 0 -> 12552 bytes
 .../build/CMakeFiles/3.5.2/CMakeSystem.cmake  |  15 +
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      | 533 ++++++++++++++++++
 .../CMakeFiles/3.5.2/CompilerIdCXX/a.out      | Bin 0 -> 12704 bytes
 .../CMakeDirectoryInformation.cmake           |  16 +
 .../bmt/build/CMakeFiles/CMakeOutput.log      | 339 +++++++++++
 .../bmt/build/CMakeFiles/Makefile.cmake       |  95 ++++
 thirdparty/bmt/build/CMakeFiles/Makefile2     | 126 +++++
 .../build/CMakeFiles/TargetDirectories.txt    |   5 +
 .../bmt/build/CMakeFiles/cmake.check_cache    |   1 +
 .../bmt/build/CMakeFiles/feature_tests.bin    | Bin 0 -> 16600 bytes
 .../bmt/build/CMakeFiles/feature_tests.cxx    | 405 +++++++++++++
 .../bmt/build/CMakeFiles/progress.marks       |   1 +
 thirdparty/bmt/build/Makefile                 | 148 +++++
 thirdparty/bmt/build/cmake_install.cmake      |  50 ++
 thirdparty/bmt/build/example                  | Bin 0 -> 72600 bytes
 .../CMakeDirectoryInformation.cmake           |  16 +
 .../CMakeFiles/example.dir/CXX.includecache   |  36 ++
 .../CMakeFiles/example.dir/DependInfo.cmake   |  21 +
 .../src/CMakeFiles/example.dir/build.make     | 113 ++++
 .../CMakeFiles/example.dir/cmake_clean.cmake  |  10 +
 .../CMakeFiles/example.dir/depend.internal    |   6 +
 .../src/CMakeFiles/example.dir/depend.make    |   6 +
 .../src/CMakeFiles/example.dir/example.cpp.o  | Bin 0 -> 87416 bytes
 .../src/CMakeFiles/example.dir/flags.make     |  10 +
 .../build/src/CMakeFiles/example.dir/link.txt |   1 +
 .../src/CMakeFiles/example.dir/progress.make  |   3 +
 .../bmt/build/src/CMakeFiles/progress.marks   |   1 +
 thirdparty/bmt/build/src/Makefile             | 180 ++++++
 thirdparty/bmt/build/src/cmake_install.cmake  |  34 ++
 thirdparty/bmt/include/noma/bmt/bmt.hpp       | 257 +++++++++
 thirdparty/bmt/src/CMakeLists.txt             |  13 +
 thirdparty/bmt/src/example.cpp                |  58 ++
 42 files changed, 3120 insertions(+)
 create mode 100644 CMakeLists.txt
 create mode 100644 src/CMakeLists.txt
 create mode 100644 src/ham/CMakeLists.txt
 create mode 100644 thirdparty/bmt/AUTHORS.md
 create mode 100644 thirdparty/bmt/CMakeLists.txt
 create mode 100644 thirdparty/bmt/LICENSE_1_0.txt
 create mode 100644 thirdparty/bmt/README.md
 create mode 100644 thirdparty/bmt/build/CMakeCache.txt
 create mode 100644 thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeCXXCompiler.cmake
 create mode 100755 thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeDetermineCompilerABI_CXX.bin
 create mode 100644 thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeSystem.cmake
 create mode 100644 thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/CMakeCXXCompilerId.cpp
 create mode 100755 thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/a.out
 create mode 100644 thirdparty/bmt/build/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 thirdparty/bmt/build/CMakeFiles/CMakeOutput.log
 create mode 100644 thirdparty/bmt/build/CMakeFiles/Makefile.cmake
 create mode 100644 thirdparty/bmt/build/CMakeFiles/Makefile2
 create mode 100644 thirdparty/bmt/build/CMakeFiles/TargetDirectories.txt
 create mode 100644 thirdparty/bmt/build/CMakeFiles/cmake.check_cache
 create mode 100755 thirdparty/bmt/build/CMakeFiles/feature_tests.bin
 create mode 100644 thirdparty/bmt/build/CMakeFiles/feature_tests.cxx
 create mode 100644 thirdparty/bmt/build/CMakeFiles/progress.marks
 create mode 100644 thirdparty/bmt/build/Makefile
 create mode 100644 thirdparty/bmt/build/cmake_install.cmake
 create mode 100755 thirdparty/bmt/build/example
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/CXX.includecache
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/DependInfo.cmake
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/build.make
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/cmake_clean.cmake
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.internal
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.make
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/example.cpp.o
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/flags.make
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/link.txt
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/example.dir/progress.make
 create mode 100644 thirdparty/bmt/build/src/CMakeFiles/progress.marks
 create mode 100644 thirdparty/bmt/build/src/Makefile
 create mode 100644 thirdparty/bmt/build/src/cmake_install.cmake
 create mode 100644 thirdparty/bmt/include/noma/bmt/bmt.hpp
 create mode 100644 thirdparty/bmt/src/CMakeLists.txt
 create mode 100644 thirdparty/bmt/src/example.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..cf05180
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,48 @@
+# Copyright (c) 2018 Marcel Ehrhardt <marcel.ehrhardt@fu-berlin.de>
+# Copyright (c) 2018 Matthias Noack <ma.noack.pr@gmail.com>
+#
+# See accompanying file LICENSE and README for further information.
+
+cmake_minimum_required(VERSION 3.2 FATAL_ERROR) # TODO verify; must come before project() so version and policy settings are in effect
+project(ham LANGUAGES CXX)
+
+# set output directory
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+message(STATUS "CMAKE_BINARY_DIR: " ${CMAKE_BINARY_DIR})
+
+### thirdparty dependencies
+
+# Boost
+find_package(Boost 1.40 COMPONENTS program_options REQUIRED)
+add_library(boost_library INTERFACE)
+target_include_directories (boost_library INTERFACE ${Boost_INCLUDE_DIRS})
+target_link_libraries (boost_library INTERFACE ${Boost_LIBRARIES})
+
+# MPI
+find_package(MPI) # not required
+if (MPI_FOUND)
+	add_library (mpi_library INTERFACE)
+	target_include_directories (mpi_library INTERFACE ${MPI_CXX_INCLUDE_PATH})
+	target_compile_options (mpi_library INTERFACE ${MPI_CXX_COMPILE_FLAGS})
+	target_link_libraries (mpi_library INTERFACE ${MPI_CXX_LIBRARIES})
+endif ()
+
+# Intel SCIF (for Xeon Phi accelerators with KNC architecture)
+find_file(SCIF_HEADER_FILE "scif.h")
+if (SCIF_HEADER_FILE)
+	set(SCIF_FOUND ON)
+	get_filename_component(SCIF_INCLUDE_DIR "${SCIF_HEADER_FILE}" DIRECTORY)
+	message(STATUS "Found SCIF: ${SCIF_HEADER_FILE}")
+
+	add_library (scif_library INTERFACE)
+	target_include_directories (scif_library INTERFACE ${SCIF_INCLUDE_DIR})
+else ()
+	message(STATUS "Could NOT find SCIF (missing: scif.h)")
+endif ()
+
+# tell the compiler to be strict
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -hstd=c++11")
+set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DHAM_DEBUG_ON")
+
+add_subdirectory(thirdparty/bmt ${CMAKE_CURRENT_BINARY_DIR}/build.noma_bmt)
+add_subdirectory(src)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..b34c36e
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Copyright (c) 2018 Marcel Ehrhardt <marcel.ehrhardt@fu-berlin.de>
+# Copyright (c) 2018 Matthias Noack <ma.noack.pr@gmail.com>
+#
+# See accompanying file LICENSE and README for further information.
+
+project(ham_exe LANGUAGES CXX)
+cmake_minimum_required(VERSION 3.2 FATAL_ERROR) # TODO verify
+
+add_subdirectory(ham)
+
+### Benchmarks
+
+## Explicit targets (not built by default)
+# Intel LEO offload directive benchmark, requires Intel compiler
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+	add_executable(benchmark_intel_leo EXCLUDE_FROM_ALL benchmark_intel_leo.cpp)
+	target_link_libraries(benchmark_intel_leo ham_interface)
+endif ()
+
+if (MPI_FOUND)
+	add_executable (benchmark_ham_offload_mpi benchmark_ham_offload.cpp)
+	target_link_libraries (benchmark_ham_offload_mpi ham_offload_mpi)
+
+	add_executable (benchmark_ham_offload_mpi_rma_dynamic benchmark_ham_offload.cpp)
+	target_link_libraries (benchmark_ham_offload_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+endif()
+
+if (SCIF_FOUND)
+	add_executable (benchmark_ham_offload_scif benchmark_ham_offload.cpp)
+	target_link_libraries (benchmark_ham_offload_scif ham_offload_scif)
+endif()
+
+### Examples/Tests
+
+# some tests for the active msg layer
+add_executable(active_msgs active_msgs.cpp)
+target_link_libraries(active_msgs ham_interface)
+
+if (MPI_FOUND)
+	add_executable(ham_offload_test_mpi ham_offload.cpp)
+	target_link_libraries(ham_offload_test_mpi ham_offload_mpi)
+
+	add_executable(ham_offload_test_explicit_mpi ham_offload_explicit.cpp)
+	target_link_libraries(ham_offload_test_explicit_mpi ham_offload_mpi_explicit)
+
+	add_executable(inner_product_mpi inner_product.cpp)
+	target_link_libraries(inner_product_mpi ham_offload_mpi)
+
+	add_executable(test_data_transfer_mpi test_data_transfer.cpp)
+	target_link_libraries(test_data_transfer_mpi ham_offload_mpi)
+
+	add_executable(test_argument_transfer_mpi test_argument_transfer.cpp)
+	target_link_libraries(test_argument_transfer_mpi ham_offload_mpi)
+
+	add_executable(ham_offload_test_mpi_rma_dynamic ham_offload.cpp)
+	target_link_libraries(ham_offload_test_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+
+	add_executable(inner_product_mpi_rma_dynamic inner_product.cpp)
+	target_link_libraries(inner_product_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+
+	add_executable(test_data_transfer_mpi_rma_dynamic test_data_transfer.cpp)
+	target_link_libraries(test_data_transfer_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+
+	add_executable(test_argument_transfer_mpi_rma_dynamic test_argument_transfer.cpp)
+	target_link_libraries(test_argument_transfer_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+endif()
+
+if (SCIF_FOUND)
+	add_executable(ham_offload_test_scif ham_offload.cpp)
+	target_link_libraries(ham_offload_test_scif ham_offload_scif)
+
+	add_executable(ham_offload_test_explicit_scif ham_offload_explicit.cpp)
+	target_link_libraries(ham_offload_test_explicit_scif ham_offload_scif_explicit)
+
+	add_executable(inner_product_scif inner_product.cpp)
+	target_link_libraries(inner_product_scif ham_offload_scif)
+
+	add_executable(test_data_transfer_scif test_data_transfer.cpp)
+	target_link_libraries(test_data_transfer_scif ham_offload_scif)
+
+	add_executable(test_argument_transfer_scif test_argument_transfer.cpp)
+	target_link_libraries(test_argument_transfer_scif ham_offload_scif)
+endif()
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
new file mode 100644
index 0000000..278d452
--- /dev/null
+++ b/src/ham/CMakeLists.txt
@@ -0,0 +1,70 @@
+# Copyright (c) 2018 Marcel Ehrhardt <marcel.ehrhardt@fu-berlin.de>
+# Copyright (c) 2018 Matthias Noack <ma.noack.pr@gmail.com>
+#
+# See accompanying file LICENSE and README for further information.
+
+project(ham_lib LANGUAGES CXX)
+cmake_minimum_required(VERSION 3.2 FATAL_ERROR) # TODO verify
+
+# interface target for ham
+add_library(ham_interface INTERFACE)
+target_compile_features(ham_interface INTERFACE cxx_auto_type cxx_range_for cxx_variadic_templates)
+target_link_libraries(ham_interface INTERFACE noma_bmt boost_library)
+target_include_directories(ham_interface INTERFACE ${CMAKE_CURRENT_LIST_DIR}/../../include)
+
+set(HAM_LIB_SRC
+	net/communicator.cpp
+	net/communicator_mpi.cpp
+	net/communicator_mpi_rma_dynamic.cpp
+	offload/runtime.cpp
+	offload/offload.cpp
+	util/cpu_affinity.cpp)
+
+if (MPI_FOUND)
+	add_library(ham_offload_mpi # SHARED if BUILD_SHARED_LIBS = TRUE
+	            ${HAM_LIB_SRC}
+	            offload/main.cpp
+	            net/communicator_mpi.cpp)
+	target_compile_definitions(ham_offload_mpi PUBLIC -DHAM_COMM_MPI=1)
+	target_link_libraries(ham_offload_mpi PUBLIC ham_interface mpi_library)
+
+	add_library(ham_offload_mpi_explicit # SHARED if BUILD_SHARED_LIBS = TRUE
+	            ${HAM_LIB_SRC}
+	            offload/main_explicit.cpp
+	            net/communicator_mpi.cpp)
+	target_compile_definitions(ham_offload_mpi_explicit PUBLIC -DHAM_COMM_MPI=1 -DHAM_EXPLICIT=1)
+	target_link_libraries(ham_offload_mpi_explicit PUBLIC ham_interface mpi_library)
+
+	add_library(ham_offload_mpi_rma_dynamic # SHARED if BUILD_SHARED_LIBS = TRUE
+	            ${HAM_LIB_SRC}
+	            offload/main.cpp
+	            net/communicator_mpi_rma_dynamic.cpp)
+	target_compile_definitions(ham_offload_mpi_rma_dynamic PUBLIC -DHAM_COMM_MPI_RMA_DYNAMIC=1)
+	target_link_libraries(ham_offload_mpi_rma_dynamic PUBLIC ham_interface mpi_library)
+
+	set_target_properties(ham_offload_mpi ham_offload_mpi_explicit ham_offload_mpi_rma_dynamic PROPERTIES
+		CXX_STANDARD 11
+		CXX_STANDARD_REQUIRED YES
+		CXX_EXTENSIONS NO)
+endif ()
+
+if (SCIF_FOUND)
+	add_library(ham_offload_scif # SHARED if BUILD_SHARED_LIBS = TRUE
+	            ${HAM_LIB_SRC}
+	            offload/main.cpp
+	            net/communicator_scif.cpp)
+	target_compile_definitions(ham_offload_scif PUBLIC -DHAM_COMM_SCIF=1)
+	target_link_libraries(ham_offload_scif PUBLIC ham_interface scif_library) # must name ham_offload_scif; ham_offload_scif_explicit is only created further down
+
+	add_library(ham_offload_scif_explicit # SHARED if BUILD_SHARED_LIBS = TRUE
+	            ${HAM_LIB_SRC}
+	            offload/main_explicit.cpp
+	            net/communicator_scif.cpp)
+	target_compile_definitions(ham_offload_scif_explicit PUBLIC -DHAM_COMM_SCIF=1 -DHAM_EXPLICIT=1)
+	target_link_libraries(ham_offload_scif_explicit PUBLIC ham_interface scif_library)
+
+	set_target_properties(ham_offload_scif ham_offload_scif_explicit PROPERTIES
+		CXX_STANDARD 11
+		CXX_STANDARD_REQUIRED YES
+		CXX_EXTENSIONS NO)
+endif ()
diff --git a/thirdparty/bmt/AUTHORS.md b/thirdparty/bmt/AUTHORS.md
new file mode 100644
index 0000000..96e8fa0
--- /dev/null
+++ b/thirdparty/bmt/AUTHORS.md
@@ -0,0 +1,4 @@
+# Original Author
+
+Matthias Noack <ma.noack.pr@gmail.com>
+
diff --git a/thirdparty/bmt/CMakeLists.txt b/thirdparty/bmt/CMakeLists.txt
new file mode 100644
index 0000000..464c511
--- /dev/null
+++ b/thirdparty/bmt/CMakeLists.txt
@@ -0,0 +1,26 @@
+# Copyright (c) 2017 Matthias Noack <ma.noack.pr@gmail.com>
+#
+# See accompanying file LICENSE and README for further information.
+
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+
+project(libnoma_bmt LANGUAGES CXX)
+
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# header only library 
+add_library(noma_bmt INTERFACE)
+# NOTE: we want to use '#include "noma/bmt/bmt.hpp"', not '#include "bmt.hpp"'
+target_include_directories(noma_bmt INTERFACE include) 
+target_compile_features(noma_bmt INTERFACE )
+
+#set_target_properties(noma_bmt PROPERTIES
+#    CXX_STANDARD 11
+#    CXX_STANDARD_REQUIRED YES
+#    CXX_EXTENSIONS NO
+#)
+
+add_subdirectory(src)
diff --git a/thirdparty/bmt/LICENSE_1_0.txt b/thirdparty/bmt/LICENSE_1_0.txt
new file mode 100644
index 0000000..36b7cd9
--- /dev/null
+++ b/thirdparty/bmt/LICENSE_1_0.txt
@@ -0,0 +1,23 @@
+Boost Software License - Version 1.0 - August 17th, 2003
+
+Permission is hereby granted, free of charge, to any person or organization
+obtaining a copy of the software and accompanying documentation covered by
+this license (the "Software") to use, reproduce, display, distribute,
+execute, and transmit the Software, and to prepare derivative works of the
+Software, and to permit third-parties to whom the Software is furnished to
+do so, all subject to the following:
+
+The copyright notices in the Software and this entire statement, including
+the above license grant, this restriction and the following disclaimer,
+must be included in all copies of the Software, in whole or in part, and
+all derivative works of the Software, unless such copies or derivative
+works are solely in the form of machine-executable object code generated by
+a source language processor.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/thirdparty/bmt/README.md b/thirdparty/bmt/README.md
new file mode 100644
index 0000000..1c53432
--- /dev/null
+++ b/thirdparty/bmt/README.md
@@ -0,0 +1,21 @@
+# Benchmark Timer Library
+
+A simple C++11 header-only library that provides a `timer` and a `statistics` class for benchmarking.
+
+See `src/example.cpp` for usage.
+
+## Building and Running the example
+
+Building:
+
+```bash
+mkdir build
+cd build
+cmake -DNOMA_BMT_BUILD_EXAMPLES=TRUE ..
+make
+```
+
+```bash
+./example
+```
+
diff --git a/thirdparty/bmt/build/CMakeCache.txt b/thirdparty/bmt/build/CMakeCache.txt
new file mode 100644
index 0000000..9b797af
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeCache.txt
@@ -0,0 +1,278 @@
+# This is the CMakeCache file.
+# For build in directory: /home/bemdeppi/ham/thirdparty/bmt/build
+# It was generated by CMake: /usr/bin/cmake
+# You can edit this file to change values found and used by cmake.
+# If you do not want to change any of the values, simply exit the editor.
+# If you do want to change a value, simply edit, save, and exit the editor.
+# The syntax for the file is as follows:
+# KEY:TYPE=VALUE
+# KEY is the name of a variable in the cache.
+# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
+# VALUE is the current value for the KEY.
+
+########################
+# EXTERNAL cache entries
+########################
+
+//Path to a program.
+CMAKE_AR:FILEPATH=/usr/bin/ar
+
+//Choose the type of build, options are: None(CMAKE_CXX_FLAGS or
+// CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.
+CMAKE_BUILD_TYPE:STRING=
+
+//Enable/Disable color output during build.
+CMAKE_COLOR_MAKEFILE:BOOL=ON
+
+//CXX compiler
+CMAKE_CXX_COMPILER:FILEPATH=/usr/bin/c++
+
+//Flags used by the compiler during all build types.
+CMAKE_CXX_FLAGS:STRING=
+
+//Flags used by the compiler during debug builds.
+CMAKE_CXX_FLAGS_DEBUG:STRING=-g
+
+//Flags used by the compiler during release builds for minimum
+// size.
+CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
+
+//Flags used by the compiler during release builds.
+CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
+
+//Flags used by the compiler during release builds with debug info.
+CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
+
+//Flags used by the linker.
+CMAKE_EXE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during debug builds.
+CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during release minsize builds.
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during release builds.
+CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during Release with Debug Info builds.
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Enable/Disable output of compile commands during generation.
+CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF
+
+//Install path prefix, prepended onto install directories.
+CMAKE_INSTALL_PREFIX:PATH=/usr/local
+
+//Path to a program.
+CMAKE_LINKER:FILEPATH=/usr/bin/ld
+
+//Path to a program.
+CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
+
+//Flags used by the linker during the creation of modules.
+CMAKE_MODULE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during debug builds.
+CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during release minsize builds.
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during release builds.
+CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during Release with Debug Info builds.
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_NM:FILEPATH=/usr/bin/nm
+
+//Path to a program.
+CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
+
+//Path to a program.
+CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
+
+//Value Computed by CMake
+CMAKE_PROJECT_NAME:STATIC=libnoma_bmt
+
+//Path to a program.
+CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
+
+//Flags used by the linker during the creation of dll's.
+CMAKE_SHARED_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during debug builds.
+CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during release minsize builds.
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during release builds.
+CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during Release with Debug Info builds.
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//If set, runtime paths are not added when installing shared libraries,
+// but are added when building.
+CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
+
+//If set, runtime paths are not added when using shared libraries.
+CMAKE_SKIP_RPATH:BOOL=NO
+
+//Flags used by the linker during the creation of static libraries.
+CMAKE_STATIC_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during debug builds.
+CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during release minsize builds.
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during release builds.
+CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during Release with Debug Info builds.
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_STRIP:FILEPATH=/usr/bin/strip
+
+//If this value is on, makefiles will be generated without the
+// .SILENT directive, and all commands will be echoed to the console
+// during the make.  This is useful for debugging only. With Visual
+// Studio IDE projects all commands are done without /nologo.
+CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
+
+//No help, variable specified on the command line.
+NOMA_BMT_BUILD_EXAMPLES:UNINITIALIZED=TRUE
+
+//Value Computed by CMake
+libnoma_bmt_BINARY_DIR:STATIC=/home/bemdeppi/ham/thirdparty/bmt/build
+
+//Value Computed by CMake
+libnoma_bmt_SOURCE_DIR:STATIC=/home/bemdeppi/ham/thirdparty/bmt
+
+
+########################
+# INTERNAL cache entries
+########################
+
+//ADVANCED property for variable: CMAKE_AR
+CMAKE_AR-ADVANCED:INTERNAL=1
+//This is the directory where this CMakeCache.txt was created
+CMAKE_CACHEFILE_DIR:INTERNAL=/home/bemdeppi/ham/thirdparty/bmt/build
+//Major version of cmake used to create the current loaded cache
+CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
+//Minor version of cmake used to create the current loaded cache
+CMAKE_CACHE_MINOR_VERSION:INTERNAL=5
+//Patch version of cmake used to create the current loaded cache
+CMAKE_CACHE_PATCH_VERSION:INTERNAL=2
+//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
+CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
+//Path to CMake executable.
+CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
+//Path to cpack program executable.
+CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
+//Path to ctest program executable.
+CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
+//ADVANCED property for variable: CMAKE_CXX_COMPILER
+CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS
+CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
+CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
+CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
+CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
+CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//Path to cache edit program executable.
+CMAKE_EDIT_COMMAND:INTERNAL=/usr/bin/ccmake
+//Executable file format
+CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
+CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
+CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
+CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
+CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
+//Name of external makefile project generator.
+CMAKE_EXTRA_GENERATOR:INTERNAL=
+//Name of generator.
+CMAKE_GENERATOR:INTERNAL=Unix Makefiles
+//Name of generator platform.
+CMAKE_GENERATOR_PLATFORM:INTERNAL=
+//Name of generator toolset.
+CMAKE_GENERATOR_TOOLSET:INTERNAL=
+//Source directory with the top level CMakeLists.txt file for this
+// project
+CMAKE_HOME_DIRECTORY:INTERNAL=/home/bemdeppi/ham/thirdparty/bmt
+//Install .so files without execute permission.
+CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0
+//ADVANCED property for variable: CMAKE_LINKER
+CMAKE_LINKER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
+CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
+CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
+CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
+CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_NM
+CMAKE_NM-ADVANCED:INTERNAL=1
+//number of local generators
+CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=2
+//ADVANCED property for variable: CMAKE_OBJCOPY
+CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJDUMP
+CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_RANLIB
+CMAKE_RANLIB-ADVANCED:INTERNAL=1
+//Path to CMake installation.
+CMAKE_ROOT:INTERNAL=/usr/share/cmake
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
+CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
+CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
+CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
+CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_RPATH
+CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
+CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
+CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
+CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STRIP
+CMAKE_STRIP-ADVANCED:INTERNAL=1
+//uname command
+CMAKE_UNAME:INTERNAL=/usr/bin/uname
+//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
+CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
+
diff --git a/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeCXXCompiler.cmake b/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeCXXCompiler.cmake
new file mode 100644
index 0000000..eadb4d9
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeCXXCompiler.cmake
@@ -0,0 +1,68 @@
+set(CMAKE_CXX_COMPILER "/usr/bin/c++")
+set(CMAKE_CXX_COMPILER_ARG1 "")
+set(CMAKE_CXX_COMPILER_ID "GNU")
+set(CMAKE_CXX_COMPILER_VERSION "4.8.5")
+set(CMAKE_CXX_COMPILER_WRAPPER "")
+set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "98")
+set(CMAKE_CXX_COMPILE_FEATURES "cxx_template_template_parameters;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
+set(CMAKE_CXX98_COMPILE_FEATURES "cxx_template_template_parameters")
+set(CMAKE_CXX11_COMPILE_FEATURES "cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
+set(CMAKE_CXX14_COMPILE_FEATURES "")
+
+set(CMAKE_CXX_PLATFORM_ID "Linux")
+set(CMAKE_CXX_SIMULATE_ID "")
+set(CMAKE_CXX_SIMULATE_VERSION "")
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_COMPILER_IS_GNUCXX 1)
+set(CMAKE_CXX_COMPILER_LOADED 1)
+set(CMAKE_CXX_COMPILER_WORKS TRUE)
+set(CMAKE_CXX_ABI_COMPILED TRUE)
+set(CMAKE_COMPILER_IS_MINGW )
+set(CMAKE_COMPILER_IS_CYGWIN )
+if(CMAKE_COMPILER_IS_CYGWIN)
+  set(CYGWIN 1)
+  set(UNIX 1)
+endif()
+
+set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
+
+if(CMAKE_COMPILER_IS_MINGW)
+  set(MINGW 1)
+endif()
+set(CMAKE_CXX_COMPILER_ID_RUN 1)
+set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
+set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;mm;CPP)
+set(CMAKE_CXX_LINKER_PREFERENCE 30)
+set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
+
+# Save compiler ABI information.
+set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
+set(CMAKE_CXX_COMPILER_ABI "ELF")
+set(CMAKE_CXX_LIBRARY_ARCHITECTURE "")
+
+if(CMAKE_CXX_SIZEOF_DATA_PTR)
+  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_CXX_COMPILER_ABI)
+  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
+endif()
+
+if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
+  set(CMAKE_LIBRARY_ARCHITECTURE "")
+endif()
+
+set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
+  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;c")
+set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/usr/lib64/gcc/x86_64-suse-linux/4.8;/usr/lib64;/lib64;/usr/x86_64-suse-linux/lib")
+set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeDetermineCompilerABI_CXX.bin b/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeDetermineCompilerABI_CXX.bin
new file mode 100755
index 0000000000000000000000000000000000000000..246feb2541717c984b88541f0c946b911aefb404
GIT binary patch
literal 12552
zcmeHNeQ;FQb-(YuU1^2HY9&M>z%^?d46(B-@nI}O1iKP~6%ii>$tJ<#vD$souD#lo
z_X7*pv1zb9Rvi+Wc4{Wgho>Ktw&Tn=(@aY<P7EPq>gi-k;<)1^{X@kj1sm6KOc^>g
zR)6Qddsc7Xt~z);{UbBJJMW&4d(OG%e!TncxpyDy8`z?2n&701je@8^eMGYCT!`2T
zNh)5KXb=&xLM#(?;CS&wWC=kVV|Z1WX6RFL2A~H|mu@q<^oT?l-e9u@L-vp$)i;|W
z3bo-4^K6o#5JAZxJ@WPJd`SWxkO;#A%8!(StVbm2T~K-#lpe!?l4r>6qqfmAsLBWR
z2q>Z>1jJ^U&q9)=wCj8^N%&=D=&~<DY|u7bD=8ruvfka$BR{V<1(e`!Ro`6w(y9Ds
z$m7tFNsp{s+mT6zGU;q-GBnw>F0^iKxR494mhC3{B)fITkgSQun-lrmhKG1me(Sj<
zS7VR*etYT{|NPc3|8_98Y5v7r3(8vYP~twV9Curv<eKsL@z4;F?Rq@a7wjugnbgz}
z<Qp}s0Z*eWsm_|JS&~COlWo$cdX5DS`@~Ic4L*-&{!<<ip)#^db==?3`>)@B{i~1k
z_Pu(kY5g;et4DtI_H&<o{p2ry^7=PldvNa!_doWvNB;5XyZ)xObKiXHm3OEKb0)GE
ztWmmj4*t{NbN@Nj*32Qla}NFs;0MHF5eysvHk-}^4TvS8%f)|F@)wIv7ypOg2W!~*
z5%{ebL0)fe15N2Wcv|o~spdV;^OwOV`H13MB#iV6JminN-s9kt{I&WBFd;U`T!>cy
zTSb#NQ5R7h8+=3Zo5l0WzMB5x+mas;-&6du65fMCF`2micAER^glUeA=dxy@7|$0?
zQ<&Qhn!ByBbfIYFdo%Gu!72!|e{j2*wDO8QxV<-%%UXl+k&I=R)ht0h<8~Fm#u5p$
zK-4M$D#29~uBsl7r?YTIauS7-IhxL<VSOixlj*EkDp*MrC*#Gqt>Bb}Xc6>Cp}=KU
z7Sa|;i>(9wJ-z1Y@apg#+y{0aX?Wji{cI^J`2(Odk13TFr}3KT;$52*SDOz=%>CZB
zt9eB4nsW__9l7L-J?*`tAt+)`oGf}Ttbsu6$d7%erI~UzbH~jPnOVLJdGpc;Frrgb
zesOvRY58w`K{0xa6;{37KlRi7v8i*hBX9g>*I@sdlQa`zXHL<Ci=8<`^XS4qK=1Ns
z^U}>ioVXJX4#h$=?_x);`eRe)AJT_UpCEhW*yZ6Dsc2@pdFf%|ogj+kPhB-2ADjA7
z?9`i^V%pi*Pp_7s{CyIPJ@GRr$EIGg_0K8&!`+AAuPC()LG!|+aAf!;?<iQ>d#9;=
zu_N8SItUbREW6M)^x`zWT9BVPJACo?q0+omTr2>Rjj1nOicOWK&-A@dwLaZ<DR#X1
z?+>3vYny*~QtNx4Ip-am_lHr}<0LPl_eQ5?qWAPqU5O6$PrVu)95}js(i=n*kA?{D
z8Ti!sRlkazx@ORbUFiE@W+wK;+r=AMe_-m(fvGE-VPmH4-(p8jYq7iDD!oYqxo<dn
zUvxOSKWd&nHX3I6_b|S4?A1_fawV3o$gF?ImYwSh>4&Y{sF|E7=J$1W4GYPPkEHiu
zvonnE!V+UcC#1c2E?t|Mc?9qVI5q&-fNuFqzyZJ?1HJ&*gbxXQuPA8`?-tr*KwHvO
z?>nZ|2Z&F5lz)Pq+ekpA+VTQBCR;uQ`5?*q16%y<+nO66@*NhN7H_z1&9a+`O*S{<
z8HQhS&5HU1PwKt%>kgt~Rh9%ijB>dK_V@!|(fj=EUpD&u!K0o&|H?0UWB$%3>tg<{
zBlX+;k&M49>hFyDSN8aWJ^psc_xOFXUC+VJU&9Wsbs#@ZANRn=J@7l&183Df=XnLW
z30T5pa>KNXUvSa%&8F03gYte!g+t~2&>~gN@zfGU(^^Gou?@s)Dt@B%jS`o8Y*HN8
zBNF?s4`y<NbQ(5IYc-{aHzFbJ6DS>1<@0P(TvL$u_3esYs33##UIL$EO1$>5Jg?uh
z@1XQa8;ED?A`)Z%Q;OzxuzrJ*XBx|^Y)?wb<Fg`Z9*^HBdZQ{=>i_S6<5+h^SLX0z
zYj5xR;L1Hid-{R{6!?8AxHjB1yA>3lh9SEz?H+OO>)rLxJkUi6Irm_t#Ajsrd*yQ8
zuk(2Fe%w>3R;Fx!_Mw|?`y4mq%Mz7{UEklK3AX%CR6p?iJArKwC8j0e$Q}f)+AoKe
z+rM7DkKu4f;ZPxIO%ys1GMfmYWvCF|BRW#KajPRfQc7o%9r=lI8|fM9AK2V6M*H{>
zHr=Z`u$iZzIT+m&1icOsfOhy{RO*qRa-a4PaP1M)JpklB@<#JfAgJs0K?0t7a$Wby
z39M^%G|_e4y9U7Uc+=S;%DfTiL!_U8R!8r$ryH5pK8d`+;GoRA6{JYw(Zz1$KJ5rG
zkgA)R$QRQet(AlD#v^O<X)gfRCXrDq-DCP6;HgF!yQv93-y}J*$sW6r-V|BrWmO9T
zl(etEj=U}*3%5uX$o-O}5~{B5b7k^4$ofAe@}4U|<=l7`NJCm^_pSx<2A+9G)tqV6
zhV;ioqc*5Niu`W<3n*+VsKQ;aBIf%p%KRPVs^k4+rujLl!*mCw+Sdb1Q9w@%be99R
zu1B_jJYPWm%1pE!0aN>Q8}hb4BQ#9t!aSjOs_}aegim_{ImJRX$1nTQ#!qVzfw~s0
zRWp1`e9L_cXdY@U+H(JbM#I0vAHWi;FB6T88$n_+Dip7wXRR>&>l-&T_Al79NXLrm
zy;bxtklL^~R9D*`YN~hJ{6yIhtj<>84Qrbg*;TAqM0%0-XL=TCsG`nW*3jW!{j4YG
zEsW<UNvHu_pY0zB&71n7hIZj>=?}k%@TC2Zdf;*sn4(2n1@ZaKXe9aXx0}%%7By-|
zsH|!I3&?)UO=xTDLT!a~3H?UOZL_XyTJ#xweiu5;5Cfg-?w-Bi>Vje)P5U4JeB|=z
z7rts$BQ6B5OejVpbzaJhjhYb%_uR5^`!d5HEFGNOrz5)8ZjrWsua<2=Cko*NnGpx@
z=%NGp^|+u`4dnqZ{>CP4pw!)=dtn2P@i)Nu7oq7M@V+MN_<XH8I!cASI=jRnidut1
z1)PW22bmrCoOPtLiA*VpHNnb^hGu=o(Mi%84X1FFfz+V$;$fT?n$hZOHL^vcHE8GE
zAtSrZXx?@oR$TpM%FcY%_l&N?yJ6|whv)%%?^KoiIWksgpOw81H~5oe+^%>yP8tiA
zvW4_m)=CCxNiP$LR6H*Vsa(ETDPXfm)OAMmA-_t!_I4l`8ZVZzR`-~dwesl%$ap@H
z>b4JwgQ2ltXy@u+2>sbTmMz_NS1>dR*mW2B8mGw?$avvUHj&Ebvbj<rRLVY-&L%@e
zIw32daNNp|S)q6)6H4T=n55aF%`Ft-;AgDtSTWVz3Bi1>l*MV+7^*L(K)VeUGlkH2
zE@@@DX^_+LOvswF5<>PL^#z?H6c0^U5;2Ry${w5$@%-39$`VyaW{%lnnq<+8=kxJH
z>JW{LLB+wBr)sVvI{LK#Zgt|W{$6zL99>B-3Z>OHz%(9Wl(r3&G+{EzUty1UPaFD{
zU0j{B0Zj}tI-;aBv7b?=t-<TCbvN5OP_{M9sMFTnRkqeBg)RlLz<!JId&<W4D9i;A
z2gt;1UgG#g=e3Pra+YIA_aBt-iIIupu%M!EQt&nf*;#f>vCF{i5Bu@6x`>ovKYpOL
zEoH^)Kn#p^mmaB-H1I(ftjRY-u_oVBjuV~64D8A|UaaKTmE*)petkKftmOMdtvJlU
z%?`(JmGbih$AOjn#&Wz^$*;b5pmGB@SktPXE9J2ts*gBX5;q+8CeA_wr|dj$D*3bb
zDV6-z^8I2Ze}P!p5OK03ZXnd)INw=lh=t|&zLNh5K_}4eWFV@p6@MC-&3qr=)G-8R
zcc7M?>ia3uGiuxi*?fvRtCJyU_fnk<abq<T978OfgCCrO&u?R8u_11*WGKoIw5O^}
zZ-DfOcV{ZOw2Ic|H-WE->i(y7LZ^E)*Sw-wkN6WVepvCJckwq%`?L3Ma-RYoEq7Xt
z`9Ps6*+1*D^Q4roo>zYbey#TYK+3lWx~QPU=K-$($3|Pwbp<87IWkQnuVrVhetQe@
znphy(=S6_crvC<7t3OGimVK}6km~oV9el0Ed=5)~_5CS@bsA35R=Vc@R%ySj>b={m
z`0n@g0cofDy_=6ii(2=yMaa|rsXO#+fqbod+s7dvq|R`qKc0jCsI*_bUVU5gT}D8@
zG>4sY;Mck@es>P}M)+UrUb|hji}&`Ha$%*EpPlFI*&f)hbq~H)*>~SZZ&3E#_t7^?
zevR(}BA`OCG&+j#$uzh1?luSd_YC4@dG-R?ERLHAx<ba)vYE`8W0~AYJYyz{xqQKl
zmnKCbH$IWEidHhbZq1rIYYWK>Vzpnjvc>!%F`AE$TV}E}K7I%)4ytksOQOBo;}2N9
zd-wLDqIAZ}M|=9iiHQkeetuxpx^#9lXF6`=h}$KZxRG;4Sf;sUcXWH7*|%de-JHwD
zGLCJr*%wm+vCX@MxpiP?PjtZCxn;|qzCm*^+B48cO1Mog8%P#%W-6Xd(!F}6({%Uk
z*oezOSD2eWw<Ef}zZYe$YxC>4)3;TmA(QUjBN69C`Tu?8UwJ7{{|>164}cGGT`zCy
z<zE0qxNvA3krQAsZ^IO02yv|Zga~JIMJtRkLxeS^D1>8Sc>ozoCq<a3RJ@Q9;pCw#
z?AfrGw@VIM`9eCEtsqR4<*iJd6cjd*DT*+4x(EXekL7?Atx4qazX{=dPI?`-QfjiL
zk}$%Qt!htDTZ8fY6A(i+@$qy5KIDpI9vu3!LKy#H5Mj*!am;D(tdY_f^y1kuyex{O
zsmOSIWF&7LWQt-@#?fbtW^$;<Hl-}_V(<|leRr{x;M-^?-Y4__<J>zIl65B9DbV>Q
z>+^n^|9=*A3Iq4J=vAPd`n>NhDuc^h@^1YY%Ds5F{^NQCl)ye+0`Vs{kpG<`)IMFG
z^@H{iS8xZy`n-?tG;BpW$);3igR;IN#SS2J=fnEPjEF2cru3gz9Y$>=l<ViXiQ*v=
zWqrP9icq7FuGyfRyZe7XN)S7h{pa{$qjH%0kNxL*7(M`fx+7wm<BXJ22&$Uh{wuby
zuK;HK=Xm5PrO)l;`rYk+9ENC2dz8Uv>mm|+wuV0K9o_n029NrL?Q<M^LNOLgYqJ^E
z<xEo`ochzM{%NHjbP59xD4HS7FQ<MvUZM^|Vt;L_n*Ig~oces&8BqFvPKA!d^)l-(
zfH?Iz{`#J(nC}Bw-rfGcQTjaoC)NJ-q|#r*iVC{x|E^2_veM`Ok*;;=yX$|^rO$ES
zO6B-vm%h6{Uq(6YwbkoZjt}{NrEYmQO}8e^4t>5Seob98yXD>VzeAV$pX=v%bNV{<
ze+^xlYOK$3DgW=6|KF>oLwP>&I2?D?&++S3Y7~;&U#1zp0~x3P<v3SW5F{gx#Ci;`
zqRgq!|JMvEefE=0upZMN03jZ?pZBeuO3^J(bvcs(2Jt$bOrn)OuRmPO`V8qmR*bU@
z!CmaQtB~0&yJRkXbwIX+8>SRo=ERCQ^dD%m73c=dnOvl}V-EfQRDtQQT?#HzM3ug~
z|9*}OB}z~8!eEo-zG6EAN?#S1fvFZ=>K|uvk>cJt^be~)a-4N419$g7b+fCBrN;Df
vThVe==pto%TuqdUyVS;KmwNGT1-Bay&qtnDv`p2i|Jhrl84+<Ach~>FoWtfX

literal 0
HcmV?d00001

diff --git a/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeSystem.cmake b/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeSystem.cmake
new file mode 100644
index 0000000..e822e95
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/3.5.2/CMakeSystem.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_HOST_SYSTEM "Linux-4.4.73-5.1_4.0.141-cray_ari_s")
+set(CMAKE_HOST_SYSTEM_NAME "Linux")
+set(CMAKE_HOST_SYSTEM_VERSION "4.4.73-5.1_4.0.141-cray_ari_s")
+set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
+
+
+
+set(CMAKE_SYSTEM "Linux-4.4.73-5.1_4.0.141-cray_ari_s")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_VERSION "4.4.73-5.1_4.0.141-cray_ari_s")
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+
+set(CMAKE_CROSSCOMPILING "FALSE")
+
+set(CMAKE_SYSTEM_LOADED 1)
diff --git a/thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/CMakeCXXCompilerId.cpp b/thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/CMakeCXXCompilerId.cpp
new file mode 100644
index 0000000..e6d8536
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/CMakeCXXCompilerId.cpp
@@ -0,0 +1,533 @@
+/* This source file must have a .cpp extension so that all C++ compilers
+   recognize the extension without flags.  Borland does not know .cxx for
+   example.  */
+#ifndef __cplusplus
+# error "A C compiler has been selected for C++."
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
+
+#if defined(__COMO__)
+# define COMPILER_ID "Comeau"
+  /* __COMO_VERSION__ = VRR */
+# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100)
+# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100)
+
+#elif defined(__INTEL_COMPILER) || defined(__ICC)
+# define COMPILER_ID "Intel"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+  /* __INTEL_COMPILER = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
+# if defined(__INTEL_COMPILER_UPDATE)
+#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
+# else
+#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
+# endif
+# if defined(__INTEL_COMPILER_BUILD_DATE)
+  /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
+#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
+# endif
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__PATHCC__)
+# define COMPILER_ID "PathScale"
+# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
+# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
+# if defined(__PATHCC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
+# endif
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# define COMPILER_ID "Embarcadero"
+# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
+# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
+# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
+
+#elif defined(__BORLANDC__)
+# define COMPILER_ID "Borland"
+  /* __BORLANDC__ = 0xVRR */
+# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
+# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# define COMPILER_ID "Watcom"
+   /* __WATCOMC__ = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__WATCOMC__)
+# define COMPILER_ID "OpenWatcom"
+   /* __WATCOMC__ = VVRP + 1100 */
+# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__SUNPRO_CC)
+# define COMPILER_ID "SunPro"
+# if __SUNPRO_CC >= 0x5100
+   /* __SUNPRO_CC = 0xVRRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# else
+   /* __SUNPRO_CC = 0xVRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# endif
+
+#elif defined(__HP_aCC)
+# define COMPILER_ID "HP"
+  /* __HP_aCC = VVRRPP */
+# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
+# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
+
+#elif defined(__DECCXX)
+# define COMPILER_ID "Compaq"
+  /* __DECCXX_VER = VVRRTPPPP */
+# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
+# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
+# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
+
+#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
+# define COMPILER_ID "zOS"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
+# define COMPILER_ID "XL"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
+# define COMPILER_ID "VisualAge"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__PGI)
+# define COMPILER_ID "PGI"
+# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
+# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
+# if defined(__PGIC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
+# endif
+
+#elif defined(_CRAYC)
+# define COMPILER_ID "Cray"
+# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
+
+#elif defined(__TI_COMPILER_VERSION__)
+# define COMPILER_ID "TI"
+  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
+# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
+# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
+# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
+
+#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version)
+# define COMPILER_ID "Fujitsu"
+
+#elif defined(__SCO_VERSION__)
+# define COMPILER_ID "SCO"
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# define COMPILER_ID "AppleClang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__GNUC__)
+# define COMPILER_ID "GNU"
+# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
+# if defined(__GNUC_MINOR__)
+#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif defined(_MSC_VER)
+# define COMPILER_ID "MSVC"
+  /* _MSC_VER = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
+# if defined(_MSC_FULL_VER)
+#  if _MSC_VER >= 1400
+    /* _MSC_FULL_VER = VVRRPPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
+#  else
+    /* _MSC_FULL_VER = VVRRPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
+#  endif
+# endif
+# if defined(_MSC_BUILD)
+#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
+# endif
+
+#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
+# define COMPILER_ID "ADSP"
+#if defined(__VISUALDSPVERSION__)
+  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
+# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
+# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
+# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
+#endif
+
+#elif defined(__IAR_SYSTEMS_ICC__ ) || defined(__IAR_SYSTEMS_ICC)
+# define COMPILER_ID "IAR"
+
+#elif defined(__ARMCC_VERSION)
+# define COMPILER_ID "ARMCC"
+#if __ARMCC_VERSION >= 1000000
+  /* __ARMCC_VERSION = VRRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
+#else
+  /* __ARMCC_VERSION = VRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
+#endif
+
+
+#elif defined(_SGI_COMPILER_VERSION) || defined(_COMPILER_VERSION)
+# define COMPILER_ID "MIPSpro"
+# if defined(_SGI_COMPILER_VERSION)
+  /* _SGI_COMPILER_VERSION = VRP */
+#  define COMPILER_VERSION_MAJOR DEC(_SGI_COMPILER_VERSION/100)
+#  define COMPILER_VERSION_MINOR DEC(_SGI_COMPILER_VERSION/10 % 10)
+#  define COMPILER_VERSION_PATCH DEC(_SGI_COMPILER_VERSION    % 10)
+# else
+  /* _COMPILER_VERSION = VRP */
+#  define COMPILER_VERSION_MAJOR DEC(_COMPILER_VERSION/100)
+#  define COMPILER_VERSION_MINOR DEC(_COMPILER_VERSION/10 % 10)
+#  define COMPILER_VERSION_PATCH DEC(_COMPILER_VERSION    % 10)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+  identification macro.  Try to identify the platform and guess that
+  it is the native compiler.  */
+#elif defined(__sgi)
+# define COMPILER_ID "MIPSpro"
+
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#ifdef __QNXNTO__
+char const* qnxnto = "INFO" ":" "qnxnto[]";
+#endif
+
+#if defined(__CRAYXE) || defined(__CRAYXC)
+char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name.  */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__sgi) || defined(__sgi__) || defined(_SGI)
+# define PLATFORM_ID "IRIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+#  define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+#  define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+#  define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+#  define PLATFORM_ID "Windows3x"
+
+# else /* unknown platform */
+#  define PLATFORM_ID ""
+# endif
+
+#else /* unknown platform */
+# define PLATFORM_ID ""
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+   the architecture of the compiler being used.  This is because
+   the compilers do not have flags that can change the architecture,
+   but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+#  define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM)
+#  if _M_ARM == 4
+#   define ARCHITECTURE_ID "ARMV4I"
+#  elif _M_ARM == 5
+#   define ARCHITECTURE_ID "ARMV5I"
+#  else
+#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+#  endif
+
+# elif defined(_M_MIPS)
+#  define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+#  define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+#  define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#else
+#  define ARCHITECTURE_ID ""
+#endif
+
+/* Convert integer to decimal digit literals.  */
+#define DEC(n)                   \
+  ('0' + (((n) / 10000000)%10)), \
+  ('0' + (((n) / 1000000)%10)),  \
+  ('0' + (((n) / 100000)%10)),   \
+  ('0' + (((n) / 10000)%10)),    \
+  ('0' + (((n) / 1000)%10)),     \
+  ('0' + (((n) / 100)%10)),      \
+  ('0' + (((n) / 10)%10)),       \
+  ('0' +  ((n) % 10))
+
+/* Convert integer to hex digit literals.  */
+#define HEX(n)             \
+  ('0' + ((n)>>28 & 0xF)), \
+  ('0' + ((n)>>24 & 0xF)), \
+  ('0' + ((n)>>20 & 0xF)), \
+  ('0' + ((n)>>16 & 0xF)), \
+  ('0' + ((n)>>12 & 0xF)), \
+  ('0' + ((n)>>8  & 0xF)), \
+  ('0' + ((n)>>4  & 0xF)), \
+  ('0' + ((n)     & 0xF))
+
+/* Construct a string literal encoding the version number components. */
+#ifdef COMPILER_VERSION_MAJOR
+char const info_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+  COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+  '.', COMPILER_VERSION_MINOR,
+#  ifdef COMPILER_VERSION_PATCH
+   '.', COMPILER_VERSION_PATCH,
+#   ifdef COMPILER_VERSION_TWEAK
+    '.', COMPILER_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+  SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+  '.', SIMULATE_VERSION_MINOR,
+#  ifdef SIMULATE_VERSION_PATCH
+   '.', SIMULATE_VERSION_PATCH,
+#   ifdef SIMULATE_VERSION_TWEAK
+    '.', SIMULATE_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+
+const char* info_language_dialect_default = "INFO" ":" "dialect_default["
+#if __cplusplus >= 201402L
+  "14"
+#elif __cplusplus >= 201103L
+  "11"
+#else
+  "98"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  int require = 0;
+  require += info_compiler[argc];
+  require += info_platform[argc];
+#ifdef COMPILER_VERSION_MAJOR
+  require += info_version[argc];
+#endif
+#ifdef SIMULATE_ID
+  require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+  require += info_simulate_version[argc];
+#endif
+#if defined(__CRAYXE) || defined(__CRAYXC)
+  require += info_cray[argc];
+#endif
+  require += info_language_dialect_default[argc];
+  (void)argv;
+  return require;
+}
diff --git a/thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/a.out b/thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..2ae39adc6aed23d4eb9598ae92fa7c71a5ff3dc9
GIT binary patch
literal 12704
zcmeHNeQ+Dcb>F)KNl28yPdSw6ScoD;N^F3XL@SgWS^z&JK&2!~ByGhqbpnASfEphR
zaF8iCj+HR>gjJm?owm`W_E<A@XWW_OkEAn>Cv|GN%2>`HZ4{@ONv8cnb>mca?TI^b
zJxb!3`Mte;AaDRX@_713XS5jX``Gv1?!MdG-M4#>jEwHnHBE5R#odCa%bt*AIk^y5
zwn|d*`o$^{5?jS)(Fjfh{vlaH(7_m9SEd<ym7D?S!M|U382x%kA`EYJ*n%OqkRa8!
zoI(n9;1j9>hC=vdIq6YbFRzs(;7N%vJfqt26$M$3NYcBi^sXvBhOJ7TA@`5^M*j&_
zKB0#|5gj5RhGo7CNtV*sYj%QgyGfFOA&D^TbJ&6*>m7j}wev<(s}ek_>RZV#y{i2T
z`8o7t($l;8dNPSXCY>wI1?Kv91$OlXi}~OV*>AE>vilEB%9`kTbD}nP;!iv(|60pG
z9eZTcPmaXj`ql&Ai#&BA{rBJZQ5p4zs;_?Dj$N>&KKGyM;y;z$Xrgxd>j-aLfqxeK
z_B!%ASCBuv0$;#?MLQ9{{dV%%E@7I<Y(8fe?O4G!O<^85WgfMX>7s2FhBC2Y(JBfv
zI&siUSOvwNI5?Ea=d6j?bjEVZ>Xx9MtWyQBWIS#biP}!}$GK|4Rn^&8Iw$szMhAz?
z9l;&JJGrZB7{JGx_#&7*eo7nCtLP~`sRXaLoRk<nJ9-|Rx>`?4l!kTXScuHO>5ZIi
z$Ts^$<k3aD;p*)Wh|K@Mdr^vur8}olw6rCGyk%nu7||&zeErH2(v}^dXU;j&LTrC4
zy71!@k%d<y^S^j|Y$AGT@j_!rL@vG9E5W7jH-yC1??UgLnU;;iLOgdb8a5dTi~&UE
zulphkuRW+wU3`x0QG?!@dVz|TuC#1CL%ipRV)+-Z8<3AI{A1+BH}*!f%aI>nFG2Zx
zBp7-06(~m*UUKwbRr+TJe!ki-M5%2OnpeMohD^QG@Df<swTslh$oxRW3jzDKauYfy
zU$}x<2l7jor(XXfsI+VpuNMKy#=^sIMixp}E{(h=YdyQ}J{UVY@@C{b8r1TvX!(al
zZREYk*>wl%@ugtPvu8q>X3CPi^(Eh%x;S$#d~bMRDSS_K;ob0Lbm6DriP6Wme5J_`
zmpmRII6nHx*S7y_<i+0^^klDo^!=r!$fIxBx3z5iGPO^PF1#_i@a`~dEOq`{Wd5QS
z+5KOoH|T*Mp9<d}o(i7`n-|Z`1X=zX#(^9o%R@ri;Y^b=1XHyzdT8I_J@I^YHl48w
z$1&SX3CWtx#O#@TA$xq3rZ2^e72>Jm3QweC87pq%g`0_$GWPM^{ZqBpg=38Eo2RTo
zF`dsH?{!jNP$B({+(|~XZsV?AK$_R?_}$Xd=K=2r90iQRVNU}-5BLKr$75cD7m4Oe
zO4>t5g*Ml!b+2jip3|CIiBD^!7y6$f0hQ{@3#{0V{4?(@E%`~-*SgQwb)cpBLGKx{
zcf(zu+PQfXvB~Bz{!ha0=ZF#ZwLYc~t!>mZsM#q?fc-YgEh-=Mwf?C-;_LdVG2-(-
z?iulI`;&%<ulKRWh_8RX>7Xx^@%4v&y<y+BL7#un*9G}OpI7$ln^5mUe|e4}b2rmX
z58U*?O%L4kzz6fdIqYvJ%_=AfWV{!1!?Y4$e$hpLQPFD@JmU#TOoc<@=|+cMugW=|
z>Q*$zLmL$RGZjD4`a+4zJq{_(>miB#&-+VxLSEKG1j;eMoZ-;4zoc|k4-pWnWiG_u
zD%h-GmxAjQWKiCNVmn8P*B+MV^^^9ils@7Bky8C<{+Ob<AFPi}u}n;3d6WI=Rq|NY
zCCz?+ThW_Txl;c>2OP(`D|&W?c3e@<`<jAYY#b?dD!6}WXpeu}_~iJAU+%0v>F*2n
zFNYE04#+ybc09)C43&EqX;5V}PIOU1&K|BT72-3p{93u3$0t8;9-lnEs<pAdlKRK-
z7##Z)CQvHKyf$;{`#W^Ok^c+jC)&$VdPeCp?QoZK+Doba%SX@s|BbQ}4E7XH7ZcWO
zu?Hcwi4a?cgu!vqlgekUp4fCLok{c*X0r}5I2j!s?nzPv7eK(aqbFW0QBds<kNZLI
zLZqjic?gwy<W|e8JqTR;Jn9|=azC|3^HM;o>rH+Fo+fI#?v(>v*Ba?X(DjC$0EVX_
zowHHa5Q08Lq6D-?de=Pz$h59G<g1KDcEWy;B7wgyjw1JJ^Ta@^?Pk1Sr$1aTC(s&?
ztj()E4_uo=M!j@D)Bgldb;3AGUHF&1k|Vq9aXQ(MA`3&TYC(XK_QZc6Z;Z>reUb(8
zge0kis%!jgnLH1&=`V@A=Ut%Q5VD`*zbY-Xd;5U=0{_(yD|a<(llmi~S)0(mfc%L5
zFbdbiRpA({h_&9=W&SJV%JDuj)ACKK!@Qtmc{0<UXx)ec`nN%M3t;;mWF6G<4r;H=
zMCUx1+8v$9JO7H%Aff9Dgnn8*zt4g2YL6nPSiA1?%YC@#r?rXJ#x|{8GrZm2E#3|q
zhgzGq#n;hn__}?qn1c1qqPh8QkQj^##cSy46NYb3^Igr+j=k%3%(xAo5YY~)4U3bF
z_3fdiCb!L7%7$Qdxq^nEws*Z##n$zt7wY=@;Cc;JG&YnqbhNK&xh3!xp65qMXcf3#
zr+p-}df`hN`Xxm3AAbqqYuDEX(d0E?iZ*RK#Mic<lhl5n(~Xv(s8c(RU^Fe-LH64|
zhQ7A1)3(Yc!EdD8x!jb!>pz3#d0nL$V!*j>_Z$OP7ZjUn+HZYpk;|iDv{kEl;zIDu
zgkp43=c&xNTQgdNgCD>9;AX?;FP)k@uH)25`?$3I(R#K;I+wPCaWW%L;;)My<Tug;
zHEXCA@Z@jo)kaGLJ$eIdpkcfQ7(Wu4?g8%`vX1-e)zMQb7S!1$4$IUW94MBGmUGbA
zgXOFzor`Bm3CsyrW+t%QcAV$ov>=#rHe0K7o;-|;LNnUE?MBWv+Wk&G5HNBFjFtoU
zW5(5gNZF-7^L|~|(K<RJ_&^?@_fC0$mm_1V_6O4I=sqt=#_fuSW5cAslq;r_IV<6(
zDZNa@Q?Y_5rt$^5TEJnEs2M$sqryZwXTm3m54tK1Iokn$AZwR$)<DwAS%q{QBptmB
zIETpoK++#Lyu%+rH3P|9Y4>h_U<R;%H++rLW(#DjcsdtP74o@!sTe5b9!%#F0h>-j
zizv)mg`^dTWio+yK8Hb?vmI`+7z00J<&t)4pcjILd?|;XBvHMc0_`?rXNrMrK4E1B
z=po}kC}7Q5aUuOjz9^<2vh35dmPAZjSh-WPB34MAqAZRxU3svI71<>H%F$v<ju|Ty
zVyB&xS7`$(B&qmTE>US?21Rmd|NrI0vFH$7yMik*)MeVCpe806O_W(Cqr3`x#5Ha5
zDOFwr*nlP`7@aQLI>BhAt%)13b%br5EZdr5w9?j*8e2P+LcfBTV4q_Ap0crVg}DIY
z1euu4QyjnOJh$;n&T<Uf6oMs2CXUnmivE~_y#MCLa>G<{8MxcS?f8b;zj8airM4nv
z#T!5j%wqj|s7BJjbU0C$ZwR|C-&2kwD~%b5B44g6-&l?#tL2*n$D7rBuc#NN8MqbY
z_^w)hwHQ<WspdDA<H>6N8o|%Gn!i?DsB6b37VGj`%J;a{c3R7Edo{mJ@YcPW-(J3l
ztmb!!XPQElEQuQkHTWE$vd|Fg1jql?{96RwaJZ9!sJmW#YKShu_e_;KhQR1<t!HP0
z@Ym&c3%bs6C!>7pTwP)a+P%~!L)=!&1ji5?SK#|s;Pcy9U2KR=)eJ=$g7#F^=@pP3
z@!w0;Tx_OR;I9E+6Se-Qc|sR0*h`0iEhmcgh~Mkt2NmCa?lcU(CYJpy_bK3wOMdNm
zpiq_U^EnnsnI1!~)frcR1Ae{!J}>3lL`aQKp1uEh1^Jcy^K)sZL!6hf8wiyshOFl|
zDz4X_Yf`WFJ!(Y7TAguwt8`Rn&HEBieE0hml6Gp}msi2}i`wx|p?f?0d_m3A95Vh=
z@p&`P>jK5Je%XiWjKbu4{BiKxaZAkW6Km1QLpu`{kiyP-cD@UK8-9zbogSt0(hBld
zz;CV7-?zw)zM<xQ|FyKgK}6I%;GF8<yblf``E9QGVKeyTH{RRx(%(n&sJGu`XAu0>
zit*1~KBVMBF8O1M-|OOUll|%z$6W21k@jo*l?9*LN%!59<Q*PhQ1a`nFNmayc4=k?
zVVG$i7&>Z>M#m@cCBpIt2d14h<Mhb^K1DDSc{7>GPscK5!p;|pW~?+P=o&O**;XRB
zYv;~S*B3Hr7hB%n+J)0%rVz_oW}=kMo`y;VRlOA^(V>H}2dtrE$A;8>ZZr{$&&~?-
zzR~SF(z%&D-@jJxwD%ni9~?194h_?H6|!rL<DASKiI|Q+Wca8s_m3VP43C<J_w5@W
znJ_29gQFv)RI@3sV8|#~n8Tkv6h0Uof^d1$FE{Kr@?nRgA`O}J0Y)fP`31)R{@q7a
zG$uvseg=Xk7PDh2)G$qaJfb`z{g#R4k|hL3rt8)ly-5`FW-69T&_^cAU8DXZYKP1D
zA|0jE#iFV}mQYCN$ngERFfD3@kRO?dVDWSoF&m&=aA1lt1XosJRs?f-+X{vUqloR2
zib5b5lt-U|bV3A)O2vvP5lo!U!JY%{f>WZd<Esc0Wd$n}BL#)cW^56ZJqiL1Ci5U|
zYYw@5g26&wwmN8~)bLCtV1y}0)fv)`2IGGihZw4fWz%uAA#am;aL5HgI7<ZKn5>0B
zGMKQYOG)VAIpFnDBzckX*z|P4I>i)4!HmOajArtv$1$ZW$#?(5WVGI6>cN`qB;KF%
zcYC)gJ_C}iOteF3!k_hdAJ5;#`74Ei^Zd^2?LaE^dB1NfgPU1ULAQPc{098F{_}bW
zlt6qSfjs{wtkEIMw+w7&82oo}H^cfIKfJ6M+<&*eGA9lJp}Q#7zp95o(Y=)f@-US!
z>LdA%*B_48C=N65XMK)G=7@>J_gxO<*kGLC3H%YWmD|s8%O=%9+W%4F_H#W9A3z!1
zT`|q^O|R1TlQNRK{SH?M=jvg({Tv64DShrI*YEECmtlxR2bDp-e@oTTr#-7%|Eu7U
zU)lZ{PY9Gid?f)^?CPZ0W#$5~O8pB;|AG?qmn9vfRnZLp9CW4rqS9Yf`rKYN<EEc<
z>GQtqveJKrj8`PCmsx)eq_TeAk3Fp%!uOOc@9sa1FVg4dzt|X(qKiuZJCa(?{L84s
zzKy(6pTA?}?_2v^^6vUyaOrcr`3wy>q<0)p&fWg|AxeDstJkF*m-6?nZh1FNw>5}@
z%K99?{!CqfyXD>VzeAV$&-HVhd*uf8e+gZ3HS2Rc%-<vPcgM;d<?+ws;JmATj+?Jj
zr;yz3Wt!oyAXC}?a(t~S@RN~>#Ci;`qpVV&<8Z&y@2$`!l=YbXzT$EJIbI)A`fhou
zt1=m25YJr?>+}4>I;_u-zP)3dWeCoy`rU=hX4&qQ^wq&!H+M`axa-=wg8tb~M}cnc
zDwB&8cdnrCyT#F>c&{?KND)^0ZvVZ5i!4fKd17$LvOCz$sM1%(W$=iac*#GN$wi8L
zSJ1zpJ}|jlsSMohe{!?S#Zu#;PdJK}t3nqk`{U}ORGd`{|9SP|k-0ti^LXTOMe}RD
P`n{W_8BuT<ch~=W2uI@i

literal 0
HcmV?d00001

diff --git a/thirdparty/bmt/build/CMakeFiles/CMakeDirectoryInformation.cmake b/thirdparty/bmt/build/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000..c883c69
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/bemdeppi/ham/thirdparty/bmt")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/bemdeppi/ham/thirdparty/bmt/build")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/thirdparty/bmt/build/CMakeFiles/CMakeOutput.log b/thirdparty/bmt/build/CMakeFiles/CMakeOutput.log
new file mode 100644
index 0000000..1f2cb96
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/CMakeOutput.log
@@ -0,0 +1,339 @@
+The system is: Linux - 4.4.73-5.1_4.0.141-cray_ari_s - x86_64
+Compiling the CXX compiler identification source file "CMakeCXXCompilerId.cpp" succeeded.
+Compiler: /usr/bin/c++ 
+Build flags: 
+Id flags: 
+
+The output was:
+0
+
+
+Compilation of the CXX compiler identification source "CMakeCXXCompilerId.cpp" produced "a.out"
+
+The CXX compiler identification is GNU, found in "/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/3.5.2/CompilerIdCXX/a.out"
+
+Determining if the CXX compiler works passed with the following output:
+Change Dir: /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp
+
+Run Build Command:"/usr/bin/gmake" "cmTC_671be/fast"
+/usr/bin/gmake -f CMakeFiles/cmTC_671be.dir/build.make CMakeFiles/cmTC_671be.dir/build
+gmake[1]: Entering directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+Building CXX object CMakeFiles/cmTC_671be.dir/testCXXCompiler.cxx.o
+/usr/bin/c++      -o CMakeFiles/cmTC_671be.dir/testCXXCompiler.cxx.o -c /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp/testCXXCompiler.cxx
+Linking CXX executable cmTC_671be
+/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_671be.dir/link.txt --verbose=1
+/usr/bin/c++        CMakeFiles/cmTC_671be.dir/testCXXCompiler.cxx.o  -o cmTC_671be -rdynamic 
+gmake[1]: Leaving directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+
+
+Detecting CXX compiler ABI info compiled with the following output:
+Change Dir: /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp
+
+Run Build Command:"/usr/bin/gmake" "cmTC_d6736/fast"
+/usr/bin/gmake -f CMakeFiles/cmTC_d6736.dir/build.make CMakeFiles/cmTC_d6736.dir/build
+gmake[1]: Entering directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+Building CXX object CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o
+/usr/bin/c++      -o CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o -c /usr/share/cmake/Modules/CMakeCXXCompilerABI.cpp
+Linking CXX executable cmTC_d6736
+/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_d6736.dir/link.txt --verbose=1
+/usr/bin/c++       -v CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o  -o cmTC_d6736 -rdynamic  
+Using built-in specs.
+COLLECT_GCC=/usr/bin/c++
+COLLECT_LTO_WRAPPER=/usr/lib64/gcc/x86_64-suse-linux/4.8/lto-wrapper
+Target: x86_64-suse-linux
+Configured with: ../configure --prefix=/usr --infodir=/usr/share/info --mandir=/usr/share/man --libdir=/usr/lib64 --libexecdir=/usr/lib64 --enable-languages=c,c++,objc,fortran,obj-c++,java,ada --enable-checking=release --with-gxx-include-dir=/usr/include/c++/4.8 --enable-ssp --disable-libssp --disable-plugin --with-bugurl=http://bugs.opensuse.org/ --with-pkgversion='SUSE Linux' --disable-libgcj --disable-libmudflap --with-slibdir=/lib64 --with-system-zlib --enable-__cxa_atexit --enable-libstdcxx-allocator=new --disable-libstdcxx-pch --enable-version-specific-runtime-libs --enable-linker-build-id --enable-linux-futex --program-suffix=-4.8 --without-system-libunwind --with-arch-32=i586 --with-tune=generic --build=x86_64-suse-linux --host=x86_64-suse-linux
+Thread model: posix
+gcc version 4.8.5 (SUSE Linux) 
+COMPILER_PATH=/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/:/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/bin/
+LIBRARY_PATH=/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/:/lib/../lib64/:/usr/lib/../lib64/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../:/lib/:/usr/lib/
+COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_d6736' '-rdynamic' '-shared-libgcc' '-mtune=generic' '-march=x86-64'
+ /usr/lib64/gcc/x86_64-suse-linux/4.8/collect2 --build-id --eh-frame-hdr -m elf_x86_64 -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o cmTC_d6736 /usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crt1.o /usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crti.o /usr/lib64/gcc/x86_64-suse-linux/4.8/crtbegin.o -L/usr/lib64/gcc/x86_64-suse-linux/4.8 -L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64 -L/lib/../lib64 -L/usr/lib/../lib64 -L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib -L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../.. CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib64/gcc/x86_64-suse-linux/4.8/crtend.o /usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crtn.o
+gmake[1]: Leaving directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+
+
+Parsed CXX implicit link information from above output:
+  link line regex: [^( *|.*[/\])(ld|([^/\]+-)?ld|collect2)[^/\]*( |$)]
+  ignore line: [Change Dir: /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp]
+  ignore line: []
+  ignore line: [Run Build Command:"/usr/bin/gmake" "cmTC_d6736/fast"]
+  ignore line: [/usr/bin/gmake -f CMakeFiles/cmTC_d6736.dir/build.make CMakeFiles/cmTC_d6736.dir/build]
+  ignore line: [gmake[1]: Entering directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp']
+  ignore line: [Building CXX object CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o]
+  ignore line: [/usr/bin/c++      -o CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o -c /usr/share/cmake/Modules/CMakeCXXCompilerABI.cpp]
+  ignore line: [Linking CXX executable cmTC_d6736]
+  ignore line: [/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_d6736.dir/link.txt --verbose=1]
+  ignore line: [/usr/bin/c++       -v CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o  -o cmTC_d6736 -rdynamic  ]
+  ignore line: [Using built-in specs.]
+  ignore line: [COLLECT_GCC=/usr/bin/c++]
+  ignore line: [COLLECT_LTO_WRAPPER=/usr/lib64/gcc/x86_64-suse-linux/4.8/lto-wrapper]
+  ignore line: [Target: x86_64-suse-linux]
+  ignore line: [Configured with: ../configure --prefix=/usr --infodir=/usr/share/info --mandir=/usr/share/man --libdir=/usr/lib64 --libexecdir=/usr/lib64 --enable-languages=c,c++,objc,fortran,obj-c++,java,ada --enable-checking=release --with-gxx-include-dir=/usr/include/c++/4.8 --enable-ssp --disable-libssp --disable-plugin --with-bugurl=http://bugs.opensuse.org/ --with-pkgversion='SUSE Linux' --disable-libgcj --disable-libmudflap --with-slibdir=/lib64 --with-system-zlib --enable-__cxa_atexit --enable-libstdcxx-allocator=new --disable-libstdcxx-pch --enable-version-specific-runtime-libs --enable-linker-build-id --enable-linux-futex --program-suffix=-4.8 --without-system-libunwind --with-arch-32=i586 --with-tune=generic --build=x86_64-suse-linux --host=x86_64-suse-linux]
+  ignore line: [Thread model: posix]
+  ignore line: [gcc version 4.8.5 (SUSE Linux) ]
+  ignore line: [COMPILER_PATH=/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/:/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/bin/]
+  ignore line: [LIBRARY_PATH=/usr/lib64/gcc/x86_64-suse-linux/4.8/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/:/lib/../lib64/:/usr/lib/../lib64/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib/:/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../:/lib/:/usr/lib/]
+  ignore line: [COLLECT_GCC_OPTIONS='-v' '-o' 'cmTC_d6736' '-rdynamic' '-shared-libgcc' '-mtune=generic' '-march=x86-64']
+  link line: [ /usr/lib64/gcc/x86_64-suse-linux/4.8/collect2 --build-id --eh-frame-hdr -m elf_x86_64 -export-dynamic -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o cmTC_d6736 /usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crt1.o /usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crti.o /usr/lib64/gcc/x86_64-suse-linux/4.8/crtbegin.o -L/usr/lib64/gcc/x86_64-suse-linux/4.8 -L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64 -L/lib/../lib64 -L/usr/lib/../lib64 -L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib -L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../.. CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o -lstdc++ -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib64/gcc/x86_64-suse-linux/4.8/crtend.o /usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crtn.o]
+    arg [/usr/lib64/gcc/x86_64-suse-linux/4.8/collect2] ==> ignore
+    arg [--build-id] ==> ignore
+    arg [--eh-frame-hdr] ==> ignore
+    arg [-m] ==> ignore
+    arg [elf_x86_64] ==> ignore
+    arg [-export-dynamic] ==> ignore
+    arg [-dynamic-linker] ==> ignore
+    arg [/lib64/ld-linux-x86-64.so.2] ==> ignore
+    arg [-o] ==> ignore
+    arg [cmTC_d6736] ==> ignore
+    arg [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crt1.o] ==> ignore
+    arg [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crti.o] ==> ignore
+    arg [/usr/lib64/gcc/x86_64-suse-linux/4.8/crtbegin.o] ==> ignore
+    arg [-L/usr/lib64/gcc/x86_64-suse-linux/4.8] ==> dir [/usr/lib64/gcc/x86_64-suse-linux/4.8]
+    arg [-L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64] ==> dir [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64]
+    arg [-L/lib/../lib64] ==> dir [/lib/../lib64]
+    arg [-L/usr/lib/../lib64] ==> dir [/usr/lib/../lib64]
+    arg [-L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib] ==> dir [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib]
+    arg [-L/usr/lib64/gcc/x86_64-suse-linux/4.8/../../..] ==> dir [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../..]
+    arg [CMakeFiles/cmTC_d6736.dir/CMakeCXXCompilerABI.cpp.o] ==> ignore
+    arg [-lstdc++] ==> lib [stdc++]
+    arg [-lm] ==> lib [m]
+    arg [-lgcc_s] ==> lib [gcc_s]
+    arg [-lgcc] ==> lib [gcc]
+    arg [-lc] ==> lib [c]
+    arg [-lgcc_s] ==> lib [gcc_s]
+    arg [-lgcc] ==> lib [gcc]
+    arg [/usr/lib64/gcc/x86_64-suse-linux/4.8/crtend.o] ==> ignore
+    arg [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64/crtn.o] ==> ignore
+  remove lib [gcc_s]
+  remove lib [gcc]
+  remove lib [gcc_s]
+  remove lib [gcc]
+  collapse library dir [/usr/lib64/gcc/x86_64-suse-linux/4.8] ==> [/usr/lib64/gcc/x86_64-suse-linux/4.8]
+  collapse library dir [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../lib64] ==> [/usr/lib64]
+  collapse library dir [/lib/../lib64] ==> [/lib64]
+  collapse library dir [/usr/lib/../lib64] ==> [/usr/lib64]
+  collapse library dir [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../../../x86_64-suse-linux/lib] ==> [/usr/x86_64-suse-linux/lib]
+  collapse library dir [/usr/lib64/gcc/x86_64-suse-linux/4.8/../../..] ==> [/usr/lib64]
+  implicit libs: [stdc++;m;c]
+  implicit dirs: [/usr/lib64/gcc/x86_64-suse-linux/4.8;/usr/lib64;/lib64;/usr/x86_64-suse-linux/lib]
+  implicit fwks: []
+
+
+
+
+Detecting CXX [-std=c++1y] compiler features compiled with the following output:
+Change Dir: /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp
+
+Run Build Command:"/usr/bin/gmake" "cmTC_a580f/fast"
+/usr/bin/gmake -f CMakeFiles/cmTC_a580f.dir/build.make CMakeFiles/cmTC_a580f.dir/build
+gmake[1]: Entering directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+Building CXX object CMakeFiles/cmTC_a580f.dir/feature_tests.cxx.o
+/usr/bin/c++     -std=c++1y -o CMakeFiles/cmTC_a580f.dir/feature_tests.cxx.o -c /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/feature_tests.cxx
+Linking CXX executable cmTC_a580f
+/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_a580f.dir/link.txt --verbose=1
+/usr/bin/c++        CMakeFiles/cmTC_a580f.dir/feature_tests.cxx.o  -o cmTC_a580f -rdynamic 
+gmake[1]: Leaving directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+
+
+    Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers
+    Feature record: CXX_FEATURE:1cxx_alias_templates
+    Feature record: CXX_FEATURE:1cxx_alignas
+    Feature record: CXX_FEATURE:1cxx_alignof
+    Feature record: CXX_FEATURE:1cxx_attributes
+    Feature record: CXX_FEATURE:0cxx_attribute_deprecated
+    Feature record: CXX_FEATURE:1cxx_auto_type
+    Feature record: CXX_FEATURE:0cxx_binary_literals
+    Feature record: CXX_FEATURE:1cxx_constexpr
+    Feature record: CXX_FEATURE:0cxx_contextual_conversions
+    Feature record: CXX_FEATURE:1cxx_decltype
+    Feature record: CXX_FEATURE:0cxx_decltype_auto
+    Feature record: CXX_FEATURE:1cxx_decltype_incomplete_return_types
+    Feature record: CXX_FEATURE:1cxx_default_function_template_args
+    Feature record: CXX_FEATURE:1cxx_defaulted_functions
+    Feature record: CXX_FEATURE:1cxx_defaulted_move_initializers
+    Feature record: CXX_FEATURE:1cxx_delegating_constructors
+    Feature record: CXX_FEATURE:1cxx_deleted_functions
+    Feature record: CXX_FEATURE:0cxx_digit_separators
+    Feature record: CXX_FEATURE:1cxx_enum_forward_declarations
+    Feature record: CXX_FEATURE:1cxx_explicit_conversions
+    Feature record: CXX_FEATURE:1cxx_extended_friend_declarations
+    Feature record: CXX_FEATURE:1cxx_extern_templates
+    Feature record: CXX_FEATURE:1cxx_final
+    Feature record: CXX_FEATURE:1cxx_func_identifier
+    Feature record: CXX_FEATURE:1cxx_generalized_initializers
+    Feature record: CXX_FEATURE:0cxx_generic_lambdas
+    Feature record: CXX_FEATURE:1cxx_inheriting_constructors
+    Feature record: CXX_FEATURE:1cxx_inline_namespaces
+    Feature record: CXX_FEATURE:1cxx_lambdas
+    Feature record: CXX_FEATURE:0cxx_lambda_init_captures
+    Feature record: CXX_FEATURE:1cxx_local_type_template_args
+    Feature record: CXX_FEATURE:1cxx_long_long_type
+    Feature record: CXX_FEATURE:1cxx_noexcept
+    Feature record: CXX_FEATURE:1cxx_nonstatic_member_init
+    Feature record: CXX_FEATURE:1cxx_nullptr
+    Feature record: CXX_FEATURE:1cxx_override
+    Feature record: CXX_FEATURE:1cxx_range_for
+    Feature record: CXX_FEATURE:1cxx_raw_string_literals
+    Feature record: CXX_FEATURE:1cxx_reference_qualified_functions
+    Feature record: CXX_FEATURE:0cxx_relaxed_constexpr
+    Feature record: CXX_FEATURE:0cxx_return_type_deduction
+    Feature record: CXX_FEATURE:1cxx_right_angle_brackets
+    Feature record: CXX_FEATURE:1cxx_rvalue_references
+    Feature record: CXX_FEATURE:1cxx_sizeof_member
+    Feature record: CXX_FEATURE:1cxx_static_assert
+    Feature record: CXX_FEATURE:1cxx_strong_enums
+    Feature record: CXX_FEATURE:1cxx_template_template_parameters
+    Feature record: CXX_FEATURE:1cxx_thread_local
+    Feature record: CXX_FEATURE:1cxx_trailing_return_types
+    Feature record: CXX_FEATURE:1cxx_unicode_literals
+    Feature record: CXX_FEATURE:1cxx_uniform_initialization
+    Feature record: CXX_FEATURE:1cxx_unrestricted_unions
+    Feature record: CXX_FEATURE:1cxx_user_literals
+    Feature record: CXX_FEATURE:0cxx_variable_templates
+    Feature record: CXX_FEATURE:1cxx_variadic_macros
+    Feature record: CXX_FEATURE:1cxx_variadic_templates
+
+
+Detecting CXX [-std=c++11] compiler features compiled with the following output:
+Change Dir: /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp
+
+Run Build Command:"/usr/bin/gmake" "cmTC_83717/fast"
+/usr/bin/gmake -f CMakeFiles/cmTC_83717.dir/build.make CMakeFiles/cmTC_83717.dir/build
+gmake[1]: Entering directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+Building CXX object CMakeFiles/cmTC_83717.dir/feature_tests.cxx.o
+/usr/bin/c++     -std=c++11 -o CMakeFiles/cmTC_83717.dir/feature_tests.cxx.o -c /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/feature_tests.cxx
+Linking CXX executable cmTC_83717
+/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_83717.dir/link.txt --verbose=1
+/usr/bin/c++        CMakeFiles/cmTC_83717.dir/feature_tests.cxx.o  -o cmTC_83717 -rdynamic 
+gmake[1]: Leaving directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+
+
+    Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers
+    Feature record: CXX_FEATURE:1cxx_alias_templates
+    Feature record: CXX_FEATURE:1cxx_alignas
+    Feature record: CXX_FEATURE:1cxx_alignof
+    Feature record: CXX_FEATURE:1cxx_attributes
+    Feature record: CXX_FEATURE:0cxx_attribute_deprecated
+    Feature record: CXX_FEATURE:1cxx_auto_type
+    Feature record: CXX_FEATURE:0cxx_binary_literals
+    Feature record: CXX_FEATURE:1cxx_constexpr
+    Feature record: CXX_FEATURE:0cxx_contextual_conversions
+    Feature record: CXX_FEATURE:1cxx_decltype
+    Feature record: CXX_FEATURE:0cxx_decltype_auto
+    Feature record: CXX_FEATURE:1cxx_decltype_incomplete_return_types
+    Feature record: CXX_FEATURE:1cxx_default_function_template_args
+    Feature record: CXX_FEATURE:1cxx_defaulted_functions
+    Feature record: CXX_FEATURE:1cxx_defaulted_move_initializers
+    Feature record: CXX_FEATURE:1cxx_delegating_constructors
+    Feature record: CXX_FEATURE:1cxx_deleted_functions
+    Feature record: CXX_FEATURE:0cxx_digit_separators
+    Feature record: CXX_FEATURE:1cxx_enum_forward_declarations
+    Feature record: CXX_FEATURE:1cxx_explicit_conversions
+    Feature record: CXX_FEATURE:1cxx_extended_friend_declarations
+    Feature record: CXX_FEATURE:1cxx_extern_templates
+    Feature record: CXX_FEATURE:1cxx_final
+    Feature record: CXX_FEATURE:1cxx_func_identifier
+    Feature record: CXX_FEATURE:1cxx_generalized_initializers
+    Feature record: CXX_FEATURE:0cxx_generic_lambdas
+    Feature record: CXX_FEATURE:1cxx_inheriting_constructors
+    Feature record: CXX_FEATURE:1cxx_inline_namespaces
+    Feature record: CXX_FEATURE:1cxx_lambdas
+    Feature record: CXX_FEATURE:0cxx_lambda_init_captures
+    Feature record: CXX_FEATURE:1cxx_local_type_template_args
+    Feature record: CXX_FEATURE:1cxx_long_long_type
+    Feature record: CXX_FEATURE:1cxx_noexcept
+    Feature record: CXX_FEATURE:1cxx_nonstatic_member_init
+    Feature record: CXX_FEATURE:1cxx_nullptr
+    Feature record: CXX_FEATURE:1cxx_override
+    Feature record: CXX_FEATURE:1cxx_range_for
+    Feature record: CXX_FEATURE:1cxx_raw_string_literals
+    Feature record: CXX_FEATURE:1cxx_reference_qualified_functions
+    Feature record: CXX_FEATURE:0cxx_relaxed_constexpr
+    Feature record: CXX_FEATURE:0cxx_return_type_deduction
+    Feature record: CXX_FEATURE:1cxx_right_angle_brackets
+    Feature record: CXX_FEATURE:1cxx_rvalue_references
+    Feature record: CXX_FEATURE:1cxx_sizeof_member
+    Feature record: CXX_FEATURE:1cxx_static_assert
+    Feature record: CXX_FEATURE:1cxx_strong_enums
+    Feature record: CXX_FEATURE:1cxx_template_template_parameters
+    Feature record: CXX_FEATURE:1cxx_thread_local
+    Feature record: CXX_FEATURE:1cxx_trailing_return_types
+    Feature record: CXX_FEATURE:1cxx_unicode_literals
+    Feature record: CXX_FEATURE:1cxx_uniform_initialization
+    Feature record: CXX_FEATURE:1cxx_unrestricted_unions
+    Feature record: CXX_FEATURE:1cxx_user_literals
+    Feature record: CXX_FEATURE:0cxx_variable_templates
+    Feature record: CXX_FEATURE:1cxx_variadic_macros
+    Feature record: CXX_FEATURE:1cxx_variadic_templates
+
+
+Detecting CXX [-std=c++98] compiler features compiled with the following output:
+Change Dir: /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp
+
+Run Build Command:"/usr/bin/gmake" "cmTC_1dbbe/fast"
+/usr/bin/gmake -f CMakeFiles/cmTC_1dbbe.dir/build.make CMakeFiles/cmTC_1dbbe.dir/build
+gmake[1]: Entering directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+Building CXX object CMakeFiles/cmTC_1dbbe.dir/feature_tests.cxx.o
+/usr/bin/c++     -std=c++98 -o CMakeFiles/cmTC_1dbbe.dir/feature_tests.cxx.o -c /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/feature_tests.cxx
+Linking CXX executable cmTC_1dbbe
+/usr/bin/cmake -E cmake_link_script CMakeFiles/cmTC_1dbbe.dir/link.txt --verbose=1
+/usr/bin/c++        CMakeFiles/cmTC_1dbbe.dir/feature_tests.cxx.o  -o cmTC_1dbbe -rdynamic 
+gmake[1]: Leaving directory '/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/CMakeTmp'
+
+
+    Feature record: CXX_FEATURE:0cxx_aggregate_default_initializers
+    Feature record: CXX_FEATURE:0cxx_alias_templates
+    Feature record: CXX_FEATURE:0cxx_alignas
+    Feature record: CXX_FEATURE:0cxx_alignof
+    Feature record: CXX_FEATURE:0cxx_attributes
+    Feature record: CXX_FEATURE:0cxx_attribute_deprecated
+    Feature record: CXX_FEATURE:0cxx_auto_type
+    Feature record: CXX_FEATURE:0cxx_binary_literals
+    Feature record: CXX_FEATURE:0cxx_constexpr
+    Feature record: CXX_FEATURE:0cxx_contextual_conversions
+    Feature record: CXX_FEATURE:0cxx_decltype
+    Feature record: CXX_FEATURE:0cxx_decltype_auto
+    Feature record: CXX_FEATURE:0cxx_decltype_incomplete_return_types
+    Feature record: CXX_FEATURE:0cxx_default_function_template_args
+    Feature record: CXX_FEATURE:0cxx_defaulted_functions
+    Feature record: CXX_FEATURE:0cxx_defaulted_move_initializers
+    Feature record: CXX_FEATURE:0cxx_delegating_constructors
+    Feature record: CXX_FEATURE:0cxx_deleted_functions
+    Feature record: CXX_FEATURE:0cxx_digit_separators
+    Feature record: CXX_FEATURE:0cxx_enum_forward_declarations
+    Feature record: CXX_FEATURE:0cxx_explicit_conversions
+    Feature record: CXX_FEATURE:0cxx_extended_friend_declarations
+    Feature record: CXX_FEATURE:0cxx_extern_templates
+    Feature record: CXX_FEATURE:0cxx_final
+    Feature record: CXX_FEATURE:0cxx_func_identifier
+    Feature record: CXX_FEATURE:0cxx_generalized_initializers
+    Feature record: CXX_FEATURE:0cxx_generic_lambdas
+    Feature record: CXX_FEATURE:0cxx_inheriting_constructors
+    Feature record: CXX_FEATURE:0cxx_inline_namespaces
+    Feature record: CXX_FEATURE:0cxx_lambdas
+    Feature record: CXX_FEATURE:0cxx_lambda_init_captures
+    Feature record: CXX_FEATURE:0cxx_local_type_template_args
+    Feature record: CXX_FEATURE:0cxx_long_long_type
+    Feature record: CXX_FEATURE:0cxx_noexcept
+    Feature record: CXX_FEATURE:0cxx_nonstatic_member_init
+    Feature record: CXX_FEATURE:0cxx_nullptr
+    Feature record: CXX_FEATURE:0cxx_override
+    Feature record: CXX_FEATURE:0cxx_range_for
+    Feature record: CXX_FEATURE:0cxx_raw_string_literals
+    Feature record: CXX_FEATURE:0cxx_reference_qualified_functions
+    Feature record: CXX_FEATURE:0cxx_relaxed_constexpr
+    Feature record: CXX_FEATURE:0cxx_return_type_deduction
+    Feature record: CXX_FEATURE:0cxx_right_angle_brackets
+    Feature record: CXX_FEATURE:0cxx_rvalue_references
+    Feature record: CXX_FEATURE:0cxx_sizeof_member
+    Feature record: CXX_FEATURE:0cxx_static_assert
+    Feature record: CXX_FEATURE:0cxx_strong_enums
+    Feature record: CXX_FEATURE:1cxx_template_template_parameters
+    Feature record: CXX_FEATURE:0cxx_thread_local
+    Feature record: CXX_FEATURE:0cxx_trailing_return_types
+    Feature record: CXX_FEATURE:0cxx_unicode_literals
+    Feature record: CXX_FEATURE:0cxx_uniform_initialization
+    Feature record: CXX_FEATURE:0cxx_unrestricted_unions
+    Feature record: CXX_FEATURE:0cxx_user_literals
+    Feature record: CXX_FEATURE:0cxx_variable_templates
+    Feature record: CXX_FEATURE:0cxx_variadic_macros
+    Feature record: CXX_FEATURE:0cxx_variadic_templates
diff --git a/thirdparty/bmt/build/CMakeFiles/Makefile.cmake b/thirdparty/bmt/build/CMakeFiles/Makefile.cmake
new file mode 100644
index 0000000..dc1d5a8
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/Makefile.cmake
@@ -0,0 +1,95 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# The generator used is:
+set(CMAKE_DEPENDS_GENERATOR "Unix Makefiles")
+
+# The top level Makefile was generated from the following files:
+set(CMAKE_MAKEFILE_DEPENDS
+  "CMakeCache.txt"
+  "../CMakeLists.txt"
+  "CMakeFiles/3.5.2/CMakeCXXCompiler.cmake"
+  "CMakeFiles/3.5.2/CMakeSystem.cmake"
+  "CMakeFiles/feature_tests.cxx"
+  "../src/CMakeLists.txt"
+  "/usr/share/cmake/Modules/CMakeCXXCompiler.cmake.in"
+  "/usr/share/cmake/Modules/CMakeCXXCompilerABI.cpp"
+  "/usr/share/cmake/Modules/CMakeCXXInformation.cmake"
+  "/usr/share/cmake/Modules/CMakeCommonLanguageInclude.cmake"
+  "/usr/share/cmake/Modules/CMakeCompilerIdDetection.cmake"
+  "/usr/share/cmake/Modules/CMakeDetermineCXXCompiler.cmake"
+  "/usr/share/cmake/Modules/CMakeDetermineCompileFeatures.cmake"
+  "/usr/share/cmake/Modules/CMakeDetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/CMakeDetermineCompilerABI.cmake"
+  "/usr/share/cmake/Modules/CMakeDetermineCompilerId.cmake"
+  "/usr/share/cmake/Modules/CMakeDetermineSystem.cmake"
+  "/usr/share/cmake/Modules/CMakeFindBinUtils.cmake"
+  "/usr/share/cmake/Modules/CMakeGenericSystem.cmake"
+  "/usr/share/cmake/Modules/CMakeLanguageInformation.cmake"
+  "/usr/share/cmake/Modules/CMakeParseArguments.cmake"
+  "/usr/share/cmake/Modules/CMakeParseImplicitLinkInfo.cmake"
+  "/usr/share/cmake/Modules/CMakeSystem.cmake.in"
+  "/usr/share/cmake/Modules/CMakeSystemSpecificInformation.cmake"
+  "/usr/share/cmake/Modules/CMakeSystemSpecificInitialize.cmake"
+  "/usr/share/cmake/Modules/CMakeTestCXXCompiler.cmake"
+  "/usr/share/cmake/Modules/CMakeTestCompilerCommon.cmake"
+  "/usr/share/cmake/Modules/CMakeUnixFindMake.cmake"
+  "/usr/share/cmake/Modules/Compiler/ADSP-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/ARMCC-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/AppleClang-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Borland-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Clang-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Clang-DetermineCompilerInternal.cmake"
+  "/usr/share/cmake/Modules/Compiler/Comeau-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Compaq-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Cray-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Embarcadero-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Fujitsu-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/GHS-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/GNU-CXX-FeatureTests.cmake"
+  "/usr/share/cmake/Modules/Compiler/GNU-CXX.cmake"
+  "/usr/share/cmake/Modules/Compiler/GNU-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/GNU.cmake"
+  "/usr/share/cmake/Modules/Compiler/HP-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/IAR-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/IBMCPP-CXX-DetermineVersionInternal.cmake"
+  "/usr/share/cmake/Modules/Compiler/Intel-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/MIPSpro-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/MSVC-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/OpenWatcom-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/PGI-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/PathScale-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/SCO-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/SunPro-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/TI-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/VisualAge-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/Watcom-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/XL-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Compiler/zOS-CXX-DetermineCompiler.cmake"
+  "/usr/share/cmake/Modules/Internal/FeatureTesting.cmake"
+  "/usr/share/cmake/Modules/Platform/Linux-CXX.cmake"
+  "/usr/share/cmake/Modules/Platform/Linux-GNU-CXX.cmake"
+  "/usr/share/cmake/Modules/Platform/Linux-GNU.cmake"
+  "/usr/share/cmake/Modules/Platform/Linux.cmake"
+  "/usr/share/cmake/Modules/Platform/UnixPaths.cmake"
+  )
+
+# The corresponding makefile is:
+set(CMAKE_MAKEFILE_OUTPUTS
+  "Makefile"
+  "CMakeFiles/cmake.check_cache"
+  )
+
+# Byproducts of CMake generate step:
+set(CMAKE_MAKEFILE_PRODUCTS
+  "CMakeFiles/3.5.2/CMakeSystem.cmake"
+  "CMakeFiles/3.5.2/CMakeCXXCompiler.cmake"
+  "CMakeFiles/3.5.2/CMakeCXXCompiler.cmake"
+  "CMakeFiles/CMakeDirectoryInformation.cmake"
+  "src/CMakeFiles/CMakeDirectoryInformation.cmake"
+  )
+
+# Dependency information for all targets:
+set(CMAKE_DEPEND_INFO_FILES
+  "src/CMakeFiles/example.dir/DependInfo.cmake"
+  )
diff --git a/thirdparty/bmt/build/CMakeFiles/Makefile2 b/thirdparty/bmt/build/CMakeFiles/Makefile2
new file mode 100644
index 0000000..5fb0193
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/Makefile2
@@ -0,0 +1,126 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# The main recursive all target
+all:
+
+.PHONY : all
+
+# The main recursive preinstall target
+preinstall:
+
+.PHONY : preinstall
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/bemdeppi/ham/thirdparty/bmt
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/bemdeppi/ham/thirdparty/bmt/build
+
+#=============================================================================
+# Directory level rules for directory src
+
+# Convenience name for "all" pass in the directory.
+src/all: src/CMakeFiles/example.dir/all
+
+.PHONY : src/all
+
+# Convenience name for "clean" pass in the directory.
+src/clean: src/CMakeFiles/example.dir/clean
+
+.PHONY : src/clean
+
+# Convenience name for "preinstall" pass in the directory.
+src/preinstall:
+
+.PHONY : src/preinstall
+
+#=============================================================================
+# Target rules for target src/CMakeFiles/example.dir
+
+# All Build rule for target.
+src/CMakeFiles/example.dir/all:
+	$(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/depend
+	$(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles --progress-num=1,2 "Built target example"
+.PHONY : src/CMakeFiles/example.dir/all
+
+# Include target in all.
+all: src/CMakeFiles/example.dir/all
+
+.PHONY : all
+
+# Build rule for subdir invocation for target.
+src/CMakeFiles/example.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles 2
+	$(MAKE) -f CMakeFiles/Makefile2 src/CMakeFiles/example.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles 0
+.PHONY : src/CMakeFiles/example.dir/rule
+
+# Convenience name for target.
+example: src/CMakeFiles/example.dir/rule
+
+.PHONY : example
+
+# clean rule for target.
+src/CMakeFiles/example.dir/clean:
+	$(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/clean
+.PHONY : src/CMakeFiles/example.dir/clean
+
+# clean rule for target.
+clean: src/CMakeFiles/example.dir/clean
+
+.PHONY : clean
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/thirdparty/bmt/build/CMakeFiles/TargetDirectories.txt b/thirdparty/bmt/build/CMakeFiles/TargetDirectories.txt
new file mode 100644
index 0000000..ba137ee
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/TargetDirectories.txt
@@ -0,0 +1,5 @@
+/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/edit_cache.dir
+/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/rebuild_cache.dir
+/home/bemdeppi/ham/thirdparty/bmt/build/src/CMakeFiles/edit_cache.dir
+/home/bemdeppi/ham/thirdparty/bmt/build/src/CMakeFiles/rebuild_cache.dir
+/home/bemdeppi/ham/thirdparty/bmt/build/src/CMakeFiles/example.dir
diff --git a/thirdparty/bmt/build/CMakeFiles/cmake.check_cache b/thirdparty/bmt/build/CMakeFiles/cmake.check_cache
new file mode 100644
index 0000000..3dccd73
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/cmake.check_cache
@@ -0,0 +1 @@
+# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/thirdparty/bmt/build/CMakeFiles/feature_tests.bin b/thirdparty/bmt/build/CMakeFiles/feature_tests.bin
new file mode 100755
index 0000000000000000000000000000000000000000..a9390051b36d43414e9fd29c5560d4b5d350d9db
GIT binary patch
literal 16600
zcmeHOeQ+GbmG9Zn$J(-2AGV2XCwL)tOl-80jWNPtdnH+BMPzJj36p>sk9J493%j#o
zcGi}F1Sbv@I*yz#MFm$95?5Va-A8d%T-`@}sZ=nSg78Nw&Ic5C$yNCv<O~T19N=@+
z0lMGo>9^XQowd!S?vJaeRWt9S-+TS~bx+Uq_S5}?!`nleCb)#e7C}^7b6m2#yAYk%
zNK)~7#XJ!gtHnyu1WqHaxU3=QVGPeH(+tB(P6uqj)f@5{y`i{77=F@Y3x@0=L27Td
z#1-nnXWKoJp%78YAU*Q+)Tbl~xL+a+_bWd#3bG!Nq<2Q?ol$xW+mt**?jQAyt`Su~
z5{iQ&I!-`rllNIjvQ&1x=B5Zg+9pZBxI`Fk@YsSO>)i}J^7CSgGB0+k_Ugr@NBPf?
z$Duoy9ox8}JD2LpW$n^r*JSU;u8kXF#X@Yo>^Ipb*&A*el`heEb0MD><02l_ui1O=
zqN#V*zO;V4Y5kIC?)_Cy^6&4VVeG&~g=1Ph?w&r$wc?84T8N8mH{+sMu&+R6nXe%E
zM%|i+t3~nrV1b{ZX!4nClRmX`r0q~xT()>#xS?U;Lk%KMb!3;?xO3jXOW%L>>-P-|
zzWBQZn;vUfKlaNvpZMIXC;sKfuYTv{yKnvEo%et9zJGe;oyda4uYBjXKYxq5P`8l1
zXh7|XI{b&h=lE5sT~$YZXC3|zz;6@FL^Qe|*lc+XXq#9rdVTyiB!8Lc@$r8Mel)<&
zGvIe%1bMw#1DeX;;A+R;v2dJ7Ij#g>z(w*=#kYuw^3S-)AAh??!6*3-o8!Plc|G|#
z@D8y+9B+yfyIdXzM)F6LeKq~XUr2tNcvSJr3V05RZYsHUEzSLn!Z6bLf^8IC({T+$
z7&jg;c3bIe(Y2g`oLMYdMPUq$>@ZT6qu3)m266@48ZpOmmRA>8gLd*>6Ts5Rq){Yl
zodA{OrU_R~=gq7QXCx<CEE(fjI}7VOQJu=#MyY6}P@OVe(^IIFg{TX9tXSka%Z9W?
z(&C2Uq5c75eQbT~YL0;yBMr}6(4QwoHD3lw^O#C$Sr(5;NW9~b;@pb;67xT|ooXJ@
zv!*Lq0*S-F3nw0Kyn1O=Bpx{7HlCr0ojCk#_+*6;(3z_*hseyTmAJRAhyx=!Lqe~g
zo<UjlpPf-LeuTBwy*V`f(>oH=uOtrt`t4mKL(iX}nUHw?X_|0}=bxu}bmn`|dvCmT
z#Wo?1Ujqk66J0d#5{J)464S384DCC4oa~Wf@9levs%B2Nt~f-z<3zFi)8}-^C#IiC
zJpIPjgmx<N({m*#f1d;s5Bv<uiRtG&{a2L!q0Lk9SCraEp?T&@aAe<ejpJZx@1CUo
zB@S;6UjhO5(y|MSN1r;4w-)4QPVIaBUZ}LL5U&>j$;R{-KS)fM-hO!SgT&F+e>`*o
z-E95giG4732wsT3y?xU&eYXrvpY0nRntriwWcZO)N0&s=g-5yw?iv2<YwLcQc=|(~
zM&yk3(acQZfj8YtTUUIQ)Wz`h8^hCQx538D;$I~WpVSi9{<ic64b*M>`fl&r*LO#s
zaq`G`jOE`wiLTSws-e{6N-IqT-sTV7daJR0uy15^_u!_U<m9Abrqg(UOxMCQHg1-3
zu0iuTYv!`|SWYoOK9@C%hHK>~a!?8uNZV$;io$q+N!N9<W2IngTnG0i94m=7QvnuA
zZozP;Cal`3v8-)6Q+Tqmq?ow?w~_@L%gyA3Q>%+A)VL)xX8=8bcw`}5E0?m8x!~5h
z#xUVweMQ7PSwKWB41;62CC8S%32e$6r16rSbm2{TBn;C@*O#|a25X#GC|@{WU0?#F
ztQ^gktevKy6kVs3bPGW<g#<BL=WRHf&bmg?nlK$R*obA9^2T_<IcPd5Oo3#MB!fo*
zW0=b(Veh<Sh4IAlOl~<@AQ!ZWX)_z-fY{+pVdiQ_mZEKBQ<m*!$Fo+z+)7(EO-MXt
zsS5;LN@kNr&diUcf*!1_ov|D|vlkqZtc|Zv%dpM7Rh%%BK~HbpM!X8?s*yA&Fh>K`
z3rRdI^ekLxV&Wy9Hsme%$*~L8WYU^&YeQ?(tivQo8hI-}W;xQQ0I^aoH{k|E9#h<L
zFf_F)j%lYYnr%T<2k~}yD8}`d62}_HcZ!{~jJvUB(7>N}_B&S2oP>ycjDlC7ip2mk
zHHAkFb3eelY&zo_@IGf5V~&};%W?yzs&l~1m1u!cUIvI1F&_)#DzLTfsJPRInG-NF
z2oQ}bJ+sxe*U_7X-x;$AJ@$F55kaemn{g~Ng&BegRqKfBm|4u_^aWPSlATQ!Qr3Bf
z3=$ac{OltrSC`rbOE%^^#y?3b8k7Rx1NcmI&ePTb)5+3jYdHYbL6WayDU+hPZ6=+<
z`2;we#rb&CYs>ZPXph^7D{*#a<`Cc{;4VPxzh-9s67V6w9|4{NY}0_C(7uw2cF%61
zO}1&v7c_^DXw7ZJr{5Byu(5^&R0)<B_+`VBKk@#|Oq66JZQCQAH@3DM3?C9(mtD8!
zs+E@$n{47UQ2Ybz5-$28Z4ZV97B=lg!>TR?cpUX|uhJiB`;*XMr1NX~U?loT!(il!
zKW<D!dLC>_M0yW5?})^6k>0*YPhaGU{z$Yx(h2$gNLcpk3D`LeJG>_WIalT)Fc*Qj
z2+T!bE&_8An2W$%1m+?z7lA+E2%J*q98W69p-|f+{0i?ChFKq7^wD2dG@pm@c?q4h
zQQ`BDr5;&6S6!}X+5=Hp<^l1tIzN?rCGfbuLDe4(#U=LsM>7RNdUTJbeIS*1V_ZTy
zC!lgf)zhAl%7-2(pX+rhdWnJzbVl%VJQdzku{`e!>D+<JCp;jIHN_>y{D&0H{a}4M
z1Ej(<eixVh$tZdJ`Yvf6hqo2IMb&%S|Mz!3$M&~$wvHb+3=C|FUa@C%&tP<z4t_ry
z-4N@YJqi+6!jKnBFCzYPJ%2ki4??1ZJ4diu;(A&CZn>W4ACD)`zlLhHGUfSm8^Y|_
z=W|1+EK!Yk?fo6P;K~13#ewJFQJjxZVOkOux4VI>{>!1|{%=yxV=UHPoGRj{|6(_e
z%ncl&%cH{B9?_jC<gIRVtdz~Ax}Ax<hxCsQ4R7mC(>Y!j4&Byw<1mj7&7*yLqM$e8
z1W-G44;pQdKM{nrgTS@>(DpEp+sPX(Ob5ZCP;-<(Lo>M^3d;!`(wb<ZheC~Xe5N-v
zW^EUBjdAEhWQc&)M9*@=X56&SN!;h@$2bT#fD|cQA+a0xuy&XjD7D>8I&Sv&>*WZ%
zX^^djwI2Z2CUK))`p5L&!Bd^kcT*RBu~l+pmm9oJHfG4e0IOOMprU>IRot7BvU0m*
zfxJVKR71@*eZEW{1=;)!B5yehREFh6AoH?9+q(hCuW`-Kt8lewqoMmni#8Ja67DyL
zzKF^NdsO8vSP=`uugm*exU0ZN$V}@K)P~_dl-9o8wgMG&wL^CmV8<rh7Ln(R$X|IA
ziw}dTUAY+d#or<{M(7fU&}-EA-3=nFJ%Br%g$9nF{riod)-Kwb+O-Z%4=)d|3NNC0
zsI_aWB8yt|$nr=Vme|lr(bBR7BqpOm@fbF25PD=&%XKY7i?%KeVMT5Hj2K!ZwPA6z
zDcBxbYWCZ_q-+RQXB%jYX<L_iO{`u@dhyQ3`j={GqN%a05rTisvz{PY7|&0T&^&O%
zo_{1XfBGvL`i0z*Kl%!eFFXIVA1*HdQ?zUAAil5_oh1JwUN>4}BA|5~m22A2BC_9p
z8T#6>L|ZLgLflAs@vJLbmtK$8?~+P0#6WQU;kgyukm#X0?Y|=nahIuI@KviBaUpnR
zLNz)W;-yUAqUmk1{!ee&u~LsjO9v)z3*mHM`?R$EsUX{ePGDn6G9&im3W;vqFUAG6
zYA6qQ@z=L%!==sLp+?w%WBd-#e<HL{19;z*ZG0i9jqXy>QMpS<qEKsaR}nvSdx=cl
zc%60Q@G@6QVNI}d<6X19BXt5t%CU_15j8Kwi-&$vXnIGuL$_VMBkJ8Zcj@+xdh3n1
zW5o@<K)2_=9)2tof_M8qZUpGLQxV|Tk-l2{oQyiU&#xqXt>V!kRJ3FlvuWE(MQKSd
z6UmI}h+?MTxYYt4i$r}nA|LauG~j&>M7#2C$+kADlRl89lgw=PlEk82>1fx^_0cZG
zb934*U3+b`YaFomTEv>p^FW%#DLa|L;Y*=Z>?+v@vv#V>rK5%-D)W|;wz|w*u1g*?
znYQb3i$xRsoMoro%;p{lI)##rY?m~ecQc^<hTL4SE05!p+-8*`&}B_pNg?A$v7o~v
zI?R)Z;R?$>Fd=X(eSmJsnv|J}*<yI-il*b3Q!0suj6sE3!qGPIWmXN2SCF(*PSK%P
z`~R;X?iw0EXzPR$Q(czz3Tk4MQTlA4q6vdhehW8<ceT;)deyaeHlT?SM#q$tChlOg
z($>gD*t(f*?JwKf$7rRkn`>-cr4)J<!~*+W#&0Pb+oLcaz&=10X7duq4?3@H{E)L8
zL&|@of>(?zybp^i`Z5LAD9FyTV~Sk{et+1HpEbp$4Eyo8`fMpHUIe0Jtb0T88c7{5
zgpokLF5EzVgE$(<Z!GT@D;v>~vB~?yYJPKhKUvKWi=cg&j?7Npzg5f67rYOw=C_ph
zi`D$vyhEzjk->Ue#kpD@-$TvuN|wa+ioD6nN*!5tJa4M`v-y<O{El*-aW#LDIMo)f
zWJz4dR)h1VD=T%eq`bec=3gQx1I=G_?5czIpE_nU=L1&i=mN7l8f2$7Kb7?KfP7Go
zPrJ_Aq6_+ZsV%y=w3Z2uE>_gxN9*wUX{@f+#pTruMd^aRsjAD5KsJbXW~#ZgiU#u+
zfUk+#_|rNO!u*N*<`wPrh(F=u#}xlbAAg&)Kbv<ezf-`Y<xUHj542S!`=@+%9+dL6
z^XkvR59;stq<p(Lq1K_7<mOI@XY0t<i_cG_okikAdmPwo`B$JpaU+Q!e`w!EnZul?
zFG<`!z@-Ju+YUIS;a4$#zVfAte}!+Jua$vaT=RTgulWAwai_FX`+U6veo!9vt>6dc
zJ-@H`{PEB45!yaSxsw&}U>*LLv{Sn-JuLYZ1{C_&b?iI`eo$Wci*@AR0zW8!{R8lW
z^2it9$D5!$@@u4>+I;s;DPKE|E5HxRJ8zQwfb}0couXSBAICPyFm4>!Z43|X89@g5
zY@)c~=8Yu2O(})kNEM89t}uq&v=rqTA{}#5aK>dSw(+W~t_c>BiQwh$PuH0e<CL;w
zq)Pex6jUmx>Q7ZEYMGQKhjgE!TZ|zEN*G@l?j85I#me21VQk;sw`0&4ylER{!OIRY
zj!#@;Frfqz+ja}%hT)z4eZ$7i?c4VZju<0-{lkN#gv|J|fmE?zWK26nIr7zxQ6786
zN44Cv3S-;nZ|d7IG=RG5@9T@n&-YZMA!IFO?RdO0>HYla`_&2dl<6Os&Hph{?`77#
zOy?J|;#3~HAVAmgV1_YlS1e~j#O#7=#rpb(uv<zi3R|m~{4udBn-VdiGG;L&VyP(`
z_B`l1UX4l#t|AQ7QDz@0C~P9<iWmi5#DK=q1t64IEMoHi3}Q|}dL6SeYC>gFFv66l
z>P<pVgYi3)5JNL&KAVIO1((c&L;rXX!~Y|Q80L5$IjG=SW2H3oD8mtakIdTR1;)*>
zF~>T<6zwY+N6Z*SHXftOQteXozv9q)6iWu)bzb4`UjBbK{}%|!Ru+2m(<_bj`TLmv
zpDbD_47|ZduLE7F&)?CmGPu$w@7GVDz7ZF<&)?fe72{E*-=v`TYx%4n+rUSvT7mU@
z^thDl(LDwFWtxg#-{T5#6A=2}EY|1m_ajRGNfj{aBca@WCvYmXzadf9=X0YtbqeJ}
z50vlz_}_sV?1jqy^L}89a+u@C{&PDF?}9#^`7q7<hKy2(s+Rr!E4J`{(x3I8_ZJT-
zeeNf>@9+OtVTi`GUl}~s6qnes0Dbxv^y_~OJc<e1=Y8pM#aJe-&EBZ3$}$b2QvbAS
z|FqJNRtf`eQ#3=GUzPgh{SO5Uh5hxYTKaKRRO<8hOPkWiud!8y+hyjT`Sf`|^{8r?
z&-Yl~-~Yc-`aJ%})OYGJrGHlW!TM1~6#QF2a*_4<yORGebc0Xc-~LlRecs2NQXPHI
zr|*yF3#iAgQ$9l^%HNm#f1-YQKTW3~trhxw?)$RZV)^C$^uIxu`mY}M^8WGkMe6?s
zx-`{TpZA&kf4uzvx;zZrKF@=rzV>-PdQNTmSRP`sFwKxoa4P*T?^~muyaFD9JL@q#
z2VSK<|NmuFDYBnzfc2RE2ng}G|GdxWiFy)#d1|Y&=wJ}9r;<r@)MN1a!_}<MaDme2
zdWO5A3S&V<`unrIp8jQ?!gB6dusN~1j{aRMJq0=ut1LcJTwO>1KUR5Kzws&fNYSVC
z{qg(7YES=Rez27jtj};*=~wzlxS=*+6ralCBgNi2`iItfTBj<Nf&1g%4xTT>QsarM
wJw?mcppTSu$Ja%vxa&$!zgInYp8<J2E}oA(uV{S@YX2ei1LjJfaew>&0~+p{aR2}S

literal 0
HcmV?d00001

diff --git a/thirdparty/bmt/build/CMakeFiles/feature_tests.cxx b/thirdparty/bmt/build/CMakeFiles/feature_tests.cxx
new file mode 100644
index 0000000..b93418c
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/feature_tests.cxx
@@ -0,0 +1,405 @@
+
+  const char features[] = {"\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L
+"1"
+#else
+"0"
+#endif
+"cxx_aggregate_default_initializers\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_alias_templates\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_alignas\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_alignof\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_attributes\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_attribute_deprecated\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_auto_type\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_binary_literals\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_constexpr\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_contextual_conversions\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_decltype\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_decltype_auto\n"
+"CXX_FEATURE:"
+#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40801) && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_decltype_incomplete_return_types\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_default_function_template_args\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_defaulted_functions\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_defaulted_move_initializers\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_delegating_constructors\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_deleted_functions\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_digit_separators\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_enum_forward_declarations\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_explicit_conversions\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_extended_friend_declarations\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_extern_templates\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_final\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_func_identifier\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_generalized_initializers\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_generic_lambdas\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_inheriting_constructors\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_inline_namespaces\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_lambdas\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_lambda_init_captures\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_local_type_template_args\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_long_long_type\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_noexcept\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_nonstatic_member_init\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_nullptr\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_override\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_range_for\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 405 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_raw_string_literals\n"
+"CXX_FEATURE:"
+#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 40801) && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_reference_qualified_functions\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L
+"1"
+#else
+"0"
+#endif
+"cxx_relaxed_constexpr\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 409 && __cplusplus > 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_return_type_deduction\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_right_angle_brackets\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_rvalue_references\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_sizeof_member\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_static_assert\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_strong_enums\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && __cplusplus
+"1"
+#else
+"0"
+#endif
+"cxx_template_template_parameters\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 408 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_thread_local\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_trailing_return_types\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_unicode_literals\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_uniform_initialization\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_unrestricted_unions\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 407 && __cplusplus >= 201103L
+"1"
+#else
+"0"
+#endif
+"cxx_user_literals\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 500 && __cplusplus >= 201402L
+"1"
+#else
+"0"
+#endif
+"cxx_variable_templates\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_variadic_macros\n"
+"CXX_FEATURE:"
+#if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && (__cplusplus >= 201103L || (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GXX_EXPERIMENTAL_CXX0X__))
+"1"
+#else
+"0"
+#endif
+"cxx_variadic_templates\n"
+
+};
+
+int main(int argc, char** argv) { (void)argv; return features[argc]; }
diff --git a/thirdparty/bmt/build/CMakeFiles/progress.marks b/thirdparty/bmt/build/CMakeFiles/progress.marks
new file mode 100644
index 0000000..0cfbf08
--- /dev/null
+++ b/thirdparty/bmt/build/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/thirdparty/bmt/build/Makefile b/thirdparty/bmt/build/Makefile
new file mode 100644
index 0000000..c178b32
--- /dev/null
+++ b/thirdparty/bmt/build/Makefile
@@ -0,0 +1,148 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/bemdeppi/ham/thirdparty/bmt
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/bemdeppi/ham/thirdparty/bmt/build
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+
+.PHONY : rebuild_cache/fast
+
+# The main all target
+all: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles/progress.marks
+	$(MAKE) -f CMakeFiles/Makefile2 all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	$(MAKE) -f CMakeFiles/Makefile2 clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	$(MAKE) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	$(MAKE) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	$(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+#=============================================================================
+# Target rules for targets named example
+
+# Build rule for target.
+example: cmake_check_build_system
+	$(MAKE) -f CMakeFiles/Makefile2 example
+.PHONY : example
+
+# fast build rule for target.
+example/fast:
+	$(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/build
+.PHONY : example/fast
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... rebuild_cache"
+	@echo "... example"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/thirdparty/bmt/build/cmake_install.cmake b/thirdparty/bmt/build/cmake_install.cmake
new file mode 100644
index 0000000..5bccbed
--- /dev/null
+++ b/thirdparty/bmt/build/cmake_install.cmake
@@ -0,0 +1,50 @@
+# Install script for directory: /home/bemdeppi/ham/thirdparty/bmt
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "0")
+endif()
+
+if(NOT CMAKE_INSTALL_LOCAL_ONLY)
+  # Include the install script for each subdirectory.
+  include("/home/bemdeppi/ham/thirdparty/bmt/build/src/cmake_install.cmake")
+
+endif()
+
+if(CMAKE_INSTALL_COMPONENT)
+  set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
+else()
+  set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
+endif()
+
+string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
+       "${CMAKE_INSTALL_MANIFEST_FILES}")
+file(WRITE "/home/bemdeppi/ham/thirdparty/bmt/build/${CMAKE_INSTALL_MANIFEST}"
+     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
diff --git a/thirdparty/bmt/build/example b/thirdparty/bmt/build/example
new file mode 100755
index 0000000000000000000000000000000000000000..657597963bcf7f09e0c490fb2b06178be6842749
GIT binary patch
literal 72600
zcmeIb3w%`7wLg9)2@sG+c&PXu6%ZeUOcFw%iUY~O1Oh}7C@OH6%uL8gUd~KHprFV@
zi2<cPsV%M6$B*9DTYR;sRiknZ_S!b}RjOWZuA-fQw)m#4n*Vq0z4n<iCubsr>Hq$I
zpN|K2*4b<A=h|znz4qScoVnIhIy24T(A1TtP17jWw$v?9@~@`t7#>gK)(W+Onp+#M
zjn(=iRX+rk&dqlU9^q^XW+E+J=?EDJg=rKc=q6ym(?v|v6jZenr268Qn{OpN{l_w@
zpr$zmg7Q&Y_dO`0fXl^=g4NU@T-&Kp;&O_=)I7=;<9spBr(hPhUqRJBDuzM@(<{>4
zh*8{4Kr0r1<4{#w_FgnV_eZwK1V^^G#f^doxSa~Be3u~~wex7NEKaz9>r2L$`1r^S
zY8)okgenX2C)NZe)P(99n<q3E7ECC}&yLh*=Zbz)`IL6nyoDkoH(T6On@1rKj_7S)
zSAMqif%MO3Z}@usZ67TA<1I^0-MJC8=MXj_R3SW!@C$@h2$K-%5xfXg4qYcBlp#bY
zf~x?bSlnsa4Gd`r{|~~I2z3ZQMQB3Ug+SL*gf_atwNnCZ0N`qbQxI-J_yGc4mmy3=
zs6_Y~!d8S65FSPNB|<I2IS8W><|2$i7>huCUW0Hw0`>7b2;&j_2!#m45q^abMEDWH
zYy`SCNFe^!;P1a93`1CsK-UiuZbz6!5zBQ3$4>)XiEszUPXt_#Fd1PcLI{EM?Jk58
z1RbFf;cf&Ug85p4$UO)bGqN1;e!kCTxDGIkuo<Bl;ZB5;5Jn)Jk3iQm2v;GTNfEwo
z1SB;+7hw^?bcBCH_@=ABB|+2@&qQ3w{pGlS65$DirxDx;EeN+EJce*90$q<tpsiq-
z4|q1;6G%hYV~UsJzQGhX-+mvFwI;+AzTaZPJ!QJj!2L*5d?MdF00SocP~86p;UUI(
zOf+@7fNxqERwx4CG=!@;zMJ8*fP)Y&LYT)m1$N*jM@0a?%l8XSIJzCk@n2i;_aT0X
z3HJcsf6s)wl<yxm;g;b(&lIQIYL3rfcpgJDn1jf5CdAK8_b%L@%JB;Uix4ItI1$c5
zpzB%*wC#Y~_}&iOY(idOy1y6q*{1jvxF3fwpK(7i(dc$C$A>WN2Y3_0H3*9tw-Im*
zLKXszQ}ZcVY0o~KmQk9nZMn~x-QSrO7@p?1?!>C<wlU7Z>z!Ir|KS_U-OrZxAMhj3
z$!^!PXJ(zfB_s2Z?U`+phM#m=+IbCw9&+4q-`TF61ExReTYolWGILO7$Mm$bH=M1x
zPt^v_96u^Fa0-b>=P9Ey-Mh~j1tw*ku{=|AzdE>ZG__?c0`-e78i(=XPSc1N#9NXj
z;sePs@twwU5<&rjx(adg9ThP_WoHPbWQXLIE;qt-1d@Sb1hQRp%|;-7B>5#?Uxq+>
zV3vI)vt(z8w`3P+e3l?AMOcnNmrnxm*N?ve1hNTA{#N3?7NG$_=~5K;>T1MI6T&Kl
zW=5`IXqJ0Ik&Re~Kyr5-0$rOB$d2BCfU(h95$IC#NU}%rc?-g=2xM#Nx(i_^0?FY0
z2&8Mf5PpX63j}pNh#MN*M-U!E_!R=_?H&a4W!6WspL9JXf!4<GwC5ka<;8;NT_^4s
zT$KK3)82pl;bia5%VrOrICH~{KlQ)dSvg|m7hgqxd;guI?x-6S`{KdPpLFm1w+SEq
z;PGuA4bA-G)J1>){<Ph{UVdHY>-j^@z2UQ&7cbw?@%8$#d28PM-Mc^j&sz&t4}aj*
z^C#aw_U`c)&f4+knLqpey=OlCQ2Qr$PHn4R_q~znFP-uJ_jXJz%guSPGB$5>=hC+E
zf4JzY8Q0$W<|BW7q4oD0|MBJ<qq^Uo@YzGxznfS2=7+CmK6dV~VSk!1aBSKyYFgg=
z>g->B_Wt7)GfD>BH~Omap4Uo$c>QI*A>r@dx9*me?|Rq1a?g~_550HUJx@;Ca@HFm
zr?3C|6?;FaDj9snWiR}0<+>{mym<e>oge&QYx(v`gLf3Hj=Xe!;pCZL?6@Md<>rf=
zlULt9`QT&wuU>rms&|Iy`4<-N_|-fAar-kL{o(8OLicTtjy&Oazd7ZUajvX;KKkV4
z!@vLJl~Q+$_mHRS@`u+)$BdtI-ufL=fBtgiZI`ck_19<IGx^iI-d=Y&bVujFwLkm1
z#L?gLcy?g!({<VR?sPQ${<7BhGV}KSB>kmBZ=G}NgLB?(t9<01b&nqU`72Mpd&}Ap
zJFEX=<U5mJJNU}qhF<va;cIKZw|Q6bIhQ}NGxp;BMYlOFxV!Dh!M7&d==x2=Z|^Vi
zy!OH1QSbcNd(Q_ya18qXiPh(S_=DwtxOncchrX_#{j1!r!loBCEuZ%CjG8f%SBFP0
zTsk)L))RxL-g$!W2lL;V_0i0Wwa@?Xb#T?6Z#wjB*H_MKI=(KO{Yb{PotJ<4(eQPb
zWdA*D(5~el{AJ~}cdgBR<p-}EeC4%8Pc@9+eqeO|th2xRbfA9qlJTocR{m<$)w-|W
z>jPfi^tU^wPXF<~?_4#o==Do(e)EwXzk4(~?vo2{IW%g+iEo{AL%+{jr*yIS*@6ok
z((kYm+tcmwSxD%#CPtj*1Uoz}kI1jFUT?xdu*1(zf*+S*pMFe|^uv>+C)?_@HU{yL
zN%}h{37+hxT{&Gz+VjUG<zysD|02q_<IkB%`n4wspJ{HiD<_ns{r4tGzcESqt|aXr
zog{r_lJt5K{@j|RoRgE3^I8)8T}kjqlGOWN68w=Q<pj>Bb<Hqs+y*x|DAz0m4`G5Q
zd~L{0QL#QFxDe$J$Ew2p&~l1uS?DKSg&1E-KRDnP@!{ex=C@NN{;g_>M^s$@ak|7W
z4@$i9Eq1_3qH@x2koci8x%N{|A6qH$X<YArL9r?Qhg>hb7;*i4q{QEMolLLl-Gl^`
zeh#NsGWHpqG{SGaJ~90t&Xo9>D<q!m4qbs0CEmTsE#l;}(RH$u<aV&uaHX4~#+3s@
zL*>7`&Mk1NUm4>hKFsCJ;2c|-pPM$z^r~Mg&yndrSS#^9PXFsM62FD{>16zm(J-p_
z^Hnl^5#xt*zuvX*X9zNmK>GE}{~)J7m?_Jt;`Em=ek=3yajy3&#(#2=O#jG6x4@9k
zMptm4#AkDRFs$M_!1(6XZV^}fDI6x#_uxMqPU=@R^HYuQ6A(PYzgH*Axk8ktX>Xk;
z@lSL6$1;8n^JfE>L-vub8&8tyE13T({T)1xpI5s@I`aSMD$17WFRgcr_+;@HzCX8f
zDa)H0$L1WFzJbd@c5&T(ip2kRja$T>;;*K?!}9+Q%gHRpPwy|&4`F$@mhsOpKl5wd
z0z*C|U0X-X@m2eQ7f7_`KUbDhe63qx3K@R|%T)*W>t@CeV}8EO^(y}VlI5yD*LxbL
z_psdVWBg5w|B(Be^F6n~sB+GQ0MIx(SWX~L;vzqpbk@1VE#m6W@41~1Z;*J^{#(&s
zD(3|rM-(friZdkMntnbQLg_!gUY4WmSQjQ>!hgi_IhV^hIalI4S$@uE{1<GO9_8_x
z!1&obj@NTJD;e)#d9d2s2IgmVm0P4#`m$$$toKxIr{dc<mbY<SufqRzl1#sb`7ns<
z)rUy@T}`rlC*!A`Eb&%-cQJoH<@PM#^yAqMv~hdT3~@y;Sj4w1maBZm|Afn*$9z-m
zESf0Of6aQT#;X+yn1y*K=b`0Xf&>q<9O?~jfp~%KbLJRX&eUd!pT_tD<0Ssa)r=Rp
zu#d;%`xeWaYR`91km=9m{(3n5Gc3=;Z;<5#8DGhEZ#K(w8RLrzWI5Y6$n?lAuBmKS
zXY+We{{D@}WokgCS8{kK_(1$z%ltgZ{rVA)?^`@yzs>l+K=3I2!40yU7e%dDUt=H$
zZ?$_j50-cbw+GdUtBdE;!+y7jD>?j4f0_QGYb1W0pkfb$@y}WK`D3<A=dc}sc!<l%
z^1PMp$xOzd;gscD?YSQnQTv;@-a9${b5KCSKfrRR#_@L6FRR_2&*Q#;%TfLPi1qHJ
zm2Oe8vSU9zTb93glf<j?CqkZw({Aq;iO&$ZuwLi!eSzg+7UTCKBeiqz2ALjaP+Wh7
z{E(b4et0=!B-qaUzct_%2xU*Uv79&6OZ*iwxppe}IRYhpdV|C>TDywnA)ooPm<eOY
z%l2HoMy3Z_#MOn4P&?Ct62FY`e`3A1+L_x}uB`La6~kmX*7@-swkNx|d?n`(UMSOB
z{kMOEeaO;I>tP47M1O~Ahq=FL+|JjXvK*`3tKt6M&hn||mtibFR=@5AmXmQ|$%hCJ
zaDUdjeN_^#$~ibl)@#+%&)A;a#qyxikIa(kmu-;cxHNgCpD6KGJ9#0?q1Eqti0$Wn
zJdRKkac#jo=mg)K7W+Ae_3>(!t5KZ3k;mQYPkx`<X|<F8fP#>mSmnH3*`FG>C|T*z
z<J>Q+U-~)PN##7jdZ*-JG!%sJJ2uO5l>Fbxa&kW75fxVzGE(|Mtk=rkmT~!Ka(_?f
z^m*tn`MnP=a|_I29$z}}p>nKxI*R+ds7|K8nalY#29DCd%I%rS_z~RBO)MwHjQ2XE
zKXZ7AEQirrF_*uY+o|S-N0ESRX4%rj^e3`?u=;^ZxE$;J_3M7JomT((#Iq#6jq6o>
zz8-uay|c>CO!gbCe(VS?-|A03j*bw24z7{ySN!=hTjHlNUfG|1?1w+mB-77A-|1SJ
zE7Q+r{hcGxnuc@CNHpNT@IpGfEYNg)MQweZ9*O$GQC-*c*-iR_;EGTr8VncJ_#%;D
zMAJ(u=IVi9n9&t;i)!lYf)&2XnxLfFQ}l*tbvWn?=qrP(bwBQ+NPfk<a$kwxQ|$6I
ziAc0CR3FhReUV^(NnI#v;OlE5p0b$>%InK@54EwmS#Jo2BlUH@noxAL-jpMAy9$vt
z<VW+vp}H0L8}!wd_{*a%m%rK<)}vuxC>lY1qEb(h%L7uLUV<v;Hw5cy>jTuJKusAM
zqx+kEdL{Z;r~7?Te>Di@(cBzeM=$DE={3Q+713%v7!KEm%jWn6H9uGvsIjZgvw-T6
zO{?ctRp=Etx~IGVJQh_UuTjyw@@Rn{)zsDJ>5Fn*x!}Gp0H)Q{`&Z`G)vprFDv#En
zE6MtnyvE7;f;r`tCFS{g`6Ru<Mf}7lM8FL610)&`fub8=<-iafM2tSz;F_%1P>J#M
z)u3Ir!)N4<9^1gEVV{Glj%h^mz+BPbx>dEz*Kkk_81)FvpA@WZh_2>7DWQrBT{(tC
z^boA=$yvK$^jBaIC+SiIz(#I*c{JZ&-xw8qg!DzLLlF`j45Q1X>yesZutBe?4}0Px
zT^MK#`=X)xx{{jm=p=DlQd;BkP{1{>+@*t(t1D!Vx`-<3q3#5XyIRuhTB+2LtHV)>
zuJA)O2<6iBg>|b!5V-}xNMmh~h<;@NNbsSY&7r7P8?3FZZwi8ylz^>5tuItZ)nIg3
zN_%O$YGn>uISDMOi%RC_*VWhh@+xbilcJ$o*cO(vIa8qAb&b%7W>;QaaFrA;U$j14
zV#lh0l~v~IJf6Z@U$Y(wtqJmg#|Jt;5{y>GwSQ)9w5rCpB7*VfAuAA4O%*Gv0mbM;
z6FgJ=^>vWSMt_u4ZZ>o%Uk5LWEBrOCT)kX3$W@Sth8#W8;IEks(OBgR2TI69R5d_H
zJ&-*}S%qHQFu#1dYGkgyDCnmqlELgl>(C|98iV^bBWlND%UBw@YC|RV$!kb@6Qr3;
zay^8rPLHl`2$rDndR&U6ZG_BE!d&ChD{_eeo_tD4MyQZ1*bKo0lEEgygX8Faq2Jfw
z^J5+%F_Dt)k2LC4p}LTi1{QO&{z^+>t0Pf8RNGLK*z_LCn;?;_zY-@^J5XAg<0-9#
zk{4hiK%rn^f|gU)F-i;O2ou1pLAzj|a&^5ngqc~6KM;AOtvaEE%9Qm*V37wlYHqw-
zSU6XnDpL%Xr+i9W1dD?Z@A}ocud%sA^aCm!khqdQHK3_N#}FrQJ}w&e@`ZT{#?dTe
zP0$!{EC&K4CR%ACiDju#nRr1Y1FS&h&_x;r;ZYbi<=DFt5UsC}5^h*ZR}Q?gC|t3S
zKayYPt7WS;$2LJ>lP?_d)%j(j@~B}_h$}tqwjEnpN=g3aSLh2!E(*dyAz|TQ75qPx
zUQ#|;XT@a0nJNY=A0QP9UYx5Yfa5n7t||Il9bQC@&mV-|`Bw_TH|%;74740LgUhgr
zux>p)IoLOo&HDz!Q2-MtG}CM~XJdXK56&=+DAAQe6RO@&AF7K&_Mu5q6N>bb@?sr|
z%ie;L0aspvNlUb5c49&DV0J_(j+RXc*3k+;4^@>cj)bWpdAbg-N0pAYzyoOV)iee@
z)F@@mrB&m8uneVa@|~;<k(lP1cn;-SU>j0wJB9h1j7gp5V-Hm0n2lJ-r1{V{a})bu
zOKEEH#M?7T8Wck?<5snY@L^2N-kOo+n589dL;~e`kbq;A6vMoz`7c3Ij%Q*fxwX!;
zn2h0Z6W8LC@g%g`ZvLHTYj-gd&@!nO%MYxol#<oflmzT_DbAe)TR_@_B^IlC7;6RL
zB$2l3P%a2YJ~RuSV^c634MFQlu#W6$9pF9zudIVoKGr;8H_+4q;aOnHqSPLwM#Hht
zg=^7R2L~t`!U}s$5Np%=2F&raDoTNqfm%_WM|21#i99;^qQ+u01wLJoH2DcLvRdPp
zlOCWWg*|(iy8f9vN3xI_Pu04hVFe)h>SZYuAiXoqHOb_QCX<xO7nUx<1v8135I3n9
zasTN!*Djc5Xg7+NYe7#LbX6)}pVn(EHzw8X^O|OM0?&_nO#_i#Aa-x;yvRQF$>jdX
zU8P<OzqCmB3+qTub$mV|%_o}#Ga!6GpYZnkFe?eo$Eq$z#|#Ne0D-`E4ehy^EoP|a
zK8UGp<bc>XM2X&u!Sa0DWu({$OZi$lF4(j%P^?eswuUMc)5~$MeNAq*l#QcV<o0H3
z5++rThiBA;<t_OEwlgY3EM=}Ae5Z0_9V*;E8`q;icVVJuCsD5s)?icF?5H*=Hx)8l
zCiY!Q=3^HufKB1XNKkIJ{ma&r#-b%IQpam6jm4ReO|`e1k|`aI515po!@Zapt98)>
zzNk;`FH#XHRhRf8fLytzvfkF!iB2!+f{l+L+<Y>Ik|u0osvHMeu1U9OhRz#(De)Xa
z&7MS;v{z5wZav_!*_X0WsXnj!?6#A8H8hE1l?t~MlRwL_{icemFdVGG77X^-U<Bmh
zkJwo#kNWGy{76ShYU3wC7yfmd0JKSbyqf@VO||1VDQoW0?XMmGgfyk0F;XoyR%juW
zFPE&gORa+1U;tZOJOJ1o^M`_U(NI;$5BJG9M@+dnF>mNk5*zwtR)k%ly`R*dAi(^a
zjtLS@5p47iQ(7UpX?*AqpVVd-H=uE7DR06g&TU2FWTBW1c<QF%_z?_7GwJ+bCw0c!
z7;fk{Z;0k$Wh6KGY-I(Sjrw_6Zm-;veOOKVR;UeA-@K$wcnW8H+Du##jOrB)@sloN
zuTnZ>Ne)#Md&nx=c*02*AK%~!#}7Yf>7Qu8{x_DGVpAgd7DjJN%#^wicB{f3bL2NK
zPx>|}v?@xSLgH+vTvyu&C8U$Kn{v`KMmjn^jT;fedej=8F(1U)s@;^tX_7i(J`0~>
z9E96$U%*y*n(WUj5)XB7q;9NB#j2D%G;CrOgne~^`dZ!R_Xi^p<6KOS`c`;^k)*XO
z){JVKuJ>tBX&f~RPGlJ!MB*5d&%$i`A75RiZUD{m57b#ql4wyX4wR37_MRLLMuOp{
zpma9v4Q;N=@S-qHRZW>>a-`<jXL05BOldHjpCE`Ymr}Qe>@o8i>^_#6x8x{iUcC}R
zDxSPz%X{3bd;7yGqZ9GNs+3yGLSr+owrq~t);aFJU@<mVO_D(!ik)=bC06@%MqP{Z
zFKM0(VKE<~=J^5vu|=7qOBv!rV%}6ewiS}e`eWC_daz(?WBa^Rvh9s4uNsdR;b8zA
z$K&BQPbGv1ez0B~6zB7OI*X^Hc(FlO9wi5>FQ#(A9F&25KFeWPugtY`2g&MM*cvy;
zHrm){=B88?OcQ{y_j0_OICwb2&;wy-F{|RZ1`eBg5~L5Cvhr|QpBFF0?y(O!h;bV&
z)n+ml#SPdFla4Fa@u^sg=ZiB`VLjqI0||%Q*j_)rLv@d<K%A9FLU@uv4>r{Mt1Z6w
zzkD;aMjVR5V5#Fx<6#xnxLOz}C=e#1T)8&u+X;ZN)yK9Ck1g3LnbN3~E-FmB`hDdm
zDL<Q3G%?wOQ`V;vY}ksYY=v3O%U8R1>U>i^IaQ_C)d!pX!G<W#I{TnTaHrKnb#fOE
zE+Wnuus<WJb)j0rQ`LRBDLg_+;yLBjhU5-OY7QV_Xy)qv>L4C_!4uFP%<uH*bKHae
zUpeUhhE*axAOkPi{-B%qbNtg9*}Rn5_hfqlLE0lHZj(JcZg(o*{9NpDS%tnHy4iX$
zrn9Z%+5YHbo64iUI-0&xcJ!*AtWBYO@Lq&?!n|R1s?Hs-$s!Gwcp5_(tK)JV$p@F-
zqj<6dJ2eS*A_Wrx%>u@JW%Jxl!b7wwVbX)g<5wu-8PGg>EZ4U(D38fQrYS`|MBAsC
zrH5A~wr<3r38P@EP6fOeP^&PhQ#JIEvE?NL$sw9ug`4g1RFX?#dT4~VTnG#Oz>Fpy
z%a)^>GUKPzKc!B%dB+4#l#7jMI23deXx_0unq^;aY@^2_Q~ijW?VuX#)??kbqQOcj
zXO($NX`!y1gvWU%;c=c>r6n_p^xW*+>`B_Jq9Q#nJ4fOcFUEa-wrfgav`{Nrym<PI
z65W-ZXQt3gZ)xJ2g#^<PGDJ{VqR~IX(i^cd4W;Xcub@iW04)=D4*ZjU>F!_i?_XX1
zzoyl)RBC&A628zVP^vf6=^aT8@7dxzo2Q2GW^JbS0N(qji|7YU^s;fQPtfo+(uYm-
z0eI&$F@J_Be+EjBbSf`&<W3A|f9(n6r##gn-cNgx<FEX3CEmbH*Y<Nfdj2h$H2&{$
zeMf$|oQe_O!bP);OMTY|nT>D%5pneH<pj`T`5jmh!5^myuim*P+ISP5%2HR33Ey8K
z0SirdszY6F6CNEfF0Tm>MjKa|36EE>jcc(9zsSI9+Hw<qunAvn!VfXw8%%h-(r#SM
zCOnOYy4IWU>Ro1{Z872LeO`5KH{nMrB;XDcUcIAD<hxCHyi#pkyG(d|n!vbroA7w0
z*SPkW@akIxL~JwR>D^;>?K9!kHxP){VZy6-y$QeHgjere6TZ`g$0u2gD`vvuQw+v+
z*o6O;fyMWgO?bS*9KZN|cakY*9IIuT@MoLwSth)CXPn5RO?bTWY+T-rA7px4`>m^R
zYTnIl(SETDp#|QJdo!PxNxCP6LFyhG#ou9Pxp7DFY9e-ZcH<g*6~#%lyE;U?lH#OF
zU2P(M1;t54yLOBC0*cey>0Nh=_-u-kigj%l@fj2+)#_R=;!`P3-S27;@mz|NDs?Rv
z@e3$UD%4db;^$DDRHw@=;-^v^wo2>D5%CceCspZkiufRklZtd@iFg{t>2>ukO~n6E
zjyS1G*Ws^$82d5BNkzIkMf@PeNj17UMEo6!kDz#)h`&Mckrdx8;xAL2RG{l_5&td4
zPo(&E5r2;2CsBO8h(AGbQh}}p5r3HCB>G*;Mf~3=P9on`CgMM(IEi$ZTg308IEivs
zj)>nvaT4J!r-<K3aT49GED_&8aT3`sO~ltwoJ6wg@K@CTFvXn|?-cPF3!GZjcErWy
z`mm(skITF*?|3&JI9ygy(%wd2wD7h+OCRL&wzt#w31S0K@TXP7&ca6o_Dp_gxwg<d
zfxe65-FRfEx8=Q6X-l8qLxqt*e!BD-B6W8TJ8M1R_E1cvfA&Z^(tBH;^FDj<Qm>=K
z`^O`V$ovE)^lp9`nY}G9$o%hc{`Hr1l{+!L4PS_yv1`zwr7!e*9Vo{q&y#$6H(s)M
zJ`zOFG@3AK;WM2NqIE6rb$2Z7`W~_jJ4@?|08$y{-s?Wt1@Uirss_FCwme3o;%)cr
z4^Si8Jsl+4-ff#n_?{*;@wT+b8X+%?qZZPn75gc;`80h-ff781h>@T@Hcm$8qYR>+
zOjT^VT|~ESCNY1yd5g?=5v6RM+u?0pxZk_&F`}Uyk{rr-0D6R|z#{sGF9lT<n?rfq
zJux&=J2V05oLt}E&<E5fqN=dhsk{%Mhu)2zn8VxZkyT<4zT=6ldJegD&K(kszg<MP
zZ7u`$=`GyaYp4P7))EaF+qks_vb6$>%FdRp73Cnhfuil6PSIK&*|I1%2-DHp_RTej
zkyfDHbPM`&K^N`Lfqog7_E-e!fOMh&RlP_zSoW{e(!b7C+mX9L6nnemtw<h}!zu`B
zSpV~%reS>>5tS1yoiC!E^{89+S(I{?C<Rzm#1?ZAKU8^;e;60X{Mrhu_wlgdn-^2O
zRe~1J-QH~oNc~e7nVw|y#Iv2q%fRh8d8a5u4R(9%dRS1SL0|j>eY%Cma1A2f7Loqr
zVOd;jd+go6Bh5}uvxqV%+5R(SP;73Gy$;(QuUECY4b5t!X05kv^=^=N6ZxhDvTC)G
zpbl`vX!Ue+tM{ULHm#mOt-gm_eK$o^tItL5CrQ>!O@Bt{g`r)$5pCJu^%5m&c^>I<
zgv7PSR*1^>Q+t0zofkwm6K8{X2zHDzC{27Io?)etp^P(}V#=U1u90vT%R;31wZ}d>
z0^BHpBVlTfbrOPWQXKjcGqJ)cD-i?E+aj9#AvNT>y=367T>3oOJ2(tkE$hB)X*{xV
z-ItEW^R8QmQejZfeCD!;VZ}~I?1}NPXHQe~8T!m!tPr>tMdqg|=`&;nW4qDvrO$5{
zMYgOv;%#X>>}~ZRWC0+W#nlmw5Utv=rnVL>3zThdUHBymJ-eePZeSeO>iO~uj#00@
zXi7`tr_`EOt##q2Ew6*8PtgFhE<DnbMtIrFq>0zsBsRB)>XsbY_<<AE#abE<v@GmI
z$+1@V=oSy!+=)7u%3@;Pjn9p4dBNNAr`Ywtv@Yyy^&Ak58r35~Atms9nb<yR@e>&s
zji+g8CX`KNRQ)JNM@CaKysb-Pv0J_%$xVlEI9iE<d2HldxbXuoGVH-<?ndc*Flycw
z%naKQ(C`a|p?V$^P>IqBN+(f<6XpEph17ZWdABV@ApI8xaX(EUt%z>i>s3{6{2&Jv
z?MFo&Ac@jL)Oaeh)dPHc>~a}DjMAu{!(3W9(9DMfxzlJEy)Ez`Sj?TW>z)`D)_^vA
z>>YMN8%!0XaT}7xi~OZz8*1=$!UlRed-9pexDUxF`(d<@(jKO?hh<vgbs;L;2VU=s
zjr*LtBG~gbW$rI~WAO5aTv(gnOp6B%%MvUqWEN40G#LB9BXDxz9&gKB8kBwBb~0n=
zE^b>F?)J8nWlCR%Mv9Qz%CgY84j4|7t(P%0ioT!eqd^Cc1^r<Yop?|AouFe}1^pHi
z9g-yI<3R^o1wBkdPw0(pIn>eD;cIg;FCdyOlD+#0C)-w*12lRg(EZ_wAIc*mCyPLj
z+hboKtu$(;a(7U<(yE>O8Oll!rYtZ@+L?`QqKHRjbw^aI==*QRsG^-vik3#|F1pzs
zn~FZr$P$0R7mV(<g*3`YBqW%=_G^+k=$({*DFKGWl^MfD?M~E?XL!8#px8Ff4K5z0
z>hBe_thswW9=2p()v)o;%C>AH7m&tLv|*6cB1An%m*v@Gbf>p!y}eX>k)SQ6;;8G1
z)%{Soc2s*pbtILG(M^n>G7;`fa`aC@ztA^Cwv~}QAVFDX*WF~8C?ZFFH;?)fDI-Xa
z>9~x%GbGRAganaLboyQ%m|ei_LQ~qn5J(jC83b?&T&7xSsnI+@==Y%$X+81BgB#)W
z4SbGJ&wV^>F|=!0QqJxsH%vAXbuUF@4duBjvHl`bEKxtLrHF026ym=H;&RU}!p6W%
z1QIUMjd}4JMsp1y2?Z#$)Bw8d(Z(Z=Mkf>;-D7h*!x9~TsF<4#0<laKxCd)rnN9Py
zP!_@c@un0nCZre-O<JtXveKf&*;>fEJBt)Un7sHb=N1xg($c%B>=x+9#&t)u#<6k3
zlPF!6kXHn|-}Q58$&S!)ZEkD47RV#sKZ18ch}ftI9@dDG+ha`-OyY5yN{Zn&8X!{6
z;FP5z<ziu4hZJFr@bR$Ag=$Iljt|c-$xDJMI7HkM)%Pg#uh?ObS}P5;)f%{+J-B1=
z(mg&Y4c5xA9w>pPY6uVH8Kd7cUqm*xjV6J=79zv~FB4$Vb~Hi{R?Y1co$IFt<A`PY
z+YSE?DGsBRoix6f{t~<xb2E-$;E4O8NFzh+_P-^z?r2)344t((XZLIl&*)`_w1FX4
zuw`pQ^V+B-Qs3HR3%Q9z*hj3(XI&*tf}J7jF757xB4yUxUaf=hn}mzPbF^W1-Du8!
z_F(o);V_EskRsIqer77Mvs#5h^2o#t<k?4j-)EFeqHqKyV-A9&B8Emt_D1fr6q6QD
zjIzZNvN0(>r>T)T?A^2E=I1^ezoh$8@PL?3jYTa9C9Bj*5bQXAQ!>uKiIG#PDLDTG
zgq9FJIRB7H`3;;OO<q(_&eM*~7UA$d)ePg=a(K(;DB#nyr}zwQ?8O3~0!>>O>xnM<
zci>QW_xh(OofaL-6!->NPY>k9Vgw1E9?Fp_M9hh@7(dUPXgWnkGbP_lIgcV|G38V<
zWe7#iWr}8|d~y(!^O*8JMO0({OpzR>ykw?4Ly@UWdDu+3pCWUavcpVirN|XbxyDSn
znj#HM2_WKa-AoMjZo8fM$b!;}MsHWVfpVRO_@Oi8%2g(7j|~8hU9}pz<#MjuOT&h#
zMp&2W-<`f_`oifIG@M@Av~GE7AGtU!kF^1qM)Qu3c{DqKm!o--Y#k-UAbLs3U+L=Z
z-gwxtsV$m~`UGWLnHLSETr|?${(mB!!P;vOQK?bJ-^uc{Y~DjTxD3L|31{<ei58$m
z?!m>k;yU4yLee?WPEISDfg+aJ79lnS<c=EkV5g~w($@PeMf_b_HA^eRo|xJigH22*
z;z3Ih<86x&+YP)MhE|YotvXWL`k1AN%V5?N6)Y0e7TN%lI}~p~@f5kV^(jl{_oaDA
z>;g8gcmr#vDYJ?Fw{fmrlKldhcEwh3E+!ishi#>n+%0nn&;lQcx1C&^i^(e2AkMWz
z<`Tf<bp8{I3My3K<C66>{fG#S;7?*i&ysYCzi*PY5r2_H*%a)5nu(twLOEsc+G9a8
zaX%9!x7%Z0Gx148N?Qc0sQ{H`j5*DnB8?bJa~jQ@CL+=Zkuj%v>pfE!=8H7<Qyu0s
zd(3GrL>g?&S<~ERPBVbgv_2;E3~U-NqMP>-)3@E;K|recSzsC28i{I;{Spb4tx>oi
ziuy1Ox5qRjhv0~EuG>pC<;v+xrZ1hoY`U&IR%2TA9B$P<#&&=g4j+aTn^ShX27_RF
zKsCG_tE<ukw+AJdJtr;(=7txfXxKeGVPZNLY1PJo&F+~dX8dHk7;8QfoF8%xwwSS^
z5zef&O}kLpQM762H?~Q5{g7+J>9^}z6nmzI|B=w?y|lU7LcTdX7;mP8!2F(u>M<e`
z1^07A5rP-Ax`eZA@~S|8gwlS;(=qIpHdyyT&)knGDwhAQ=%pC<N6iU_Z8|8I6oNEd
zOf2M5)7l=3Q<V2}N-j1WfWlpS><0pari#rdWc)K>e<rYE>_u6hh;3Es5b}u*$^4>2
z2Z$&u0T(!t<~{#8Z%c<)oO3ab7=hwMrvIiIL?-NTiS;jB#8Ll&_7l3(>yUY{kd~={
zU5L`&Nsm*quzDMBS-lyuEXPCc%pUQ!iedn<lAH2e)P*t;Q6mFyP&Vos*(geviDj4^
zI5gqVNYqXee1x@g8xk5r?@kysVr$$X-}_goMf;Gs!{R6J0<CITwrAh4JKK!HOzlo^
zf44(sAp-qT&f!BD)D+t+=0+|}mPM(j5DMZmI&2y*=3(4k2r@B1us^=7>_o?7$Q~n+
zX%q8&){ytHS`_-}9d=P0d)v@cQhwo$eu{~LI$>$P;4Kb%&}M^@Q2u_JVB{`l){yH&
zC2~b;H3Ij_4F}X`Ea~jDcSjN<z-e<t-%R^tdpP}mG#)~X{_skwogIBiXvOA7OPbMz
zG8D59tC=?Bf(#_c#!c}R!a~~IQ5;%>npELei8SA^J@ycs8<gLHR#~>k5;%CKENjTc
zc!EHz%aV-H7%`U|)d;yo_js*&k?@pB_My@OCkFguplKo@7POfe_hag(*tmsei?yIg
z|Jr|(2&h$f7HWX}Npm?3md0Rh0_A-?rC~!MeHRu<h_J;X0mm9HEYe)LUk4S(jsu#e
z+Q9W2mW6nVM2&~BF%9WYunWHy&9H6zTOd)<kH=f8dX}V>2{lqpu#kq3{E&(hYGlkF
zUNj7n2}9ljd6GNk5K1stYTgFX0-g}ik~0PWpA`kSiXBblYLDG2OO73yOtJIc7P2MF
zX`Kt<7R++OPRHJSiw5xd*em}<5LLIwnkl*5o>p8)<k!Y^hqdU*VnC@wt#iRfQR2Kf
zU*rjriO15yj3O~_fxSW=ArK;8p^-0s4#E2N#ABE)jRzNI$?i+h#EBDgrAAJZ=%7$*
zG>N3NJtogF<=#A^&r{{`32MK0+u$@Lg>JvW9hgq+<VO%Br9JkfaOl8dQvQZa2{CTi
z-A0+$$QI!+%oCF|Ah(DtWR;Oc%us+D*Rh(2f}#Xl5YJDXYJtlXsU&A<<u5Z)wOM?N
zkrWdg)YX`F_shu@ohr-f+DcO~8Z?$hdE>g6)_9RDv_1AK+TpRwCKUKHaP-hQD(xQ3
z3v$$AK%oI*)OHhAbcAfhT!nrS{fljSgK|6{Tl*$K!Kt%F8}_R6A-GCeX1lP9#*jq9
zI3B{Sn>Ixl?WVmE7zibk3Ar~Rx5akI?I-hO8tR#djnYy<ktHYE3S}oKA=wE!7*1>f
z)gv_GRMC7$wtN6cGU*JId`NCx8trqU-zcfEQ&^vtxl}S3EA}~o7tB4+)S{lcgjDUZ
ztUp`k*&}g_CbjJWagJxzwtv7#%C;zN!}th`1EHhQZI5*eH6_In(Hm)}kB&#*0Hy=p
zm-J-^BL+EZk5%m#k|5G9qqOXvf;3-5#e~uU(}yk#`FctiYR&6Fp13tThZ58CJ0n!h
zP1F!zB107>0E}>+pmK&VDaRWMA-IZC=@}%du01B75TVY1+#VB8G^mGuTEyXIfV9ix
zh$_+SDMC30@2v8Bahw8~N%X&rSs=?S@uT87dXDfBYU;-%{NM2$mhA2qn<n`TC$1CE
zQMAYY9q}Ym)l;G#k84j)cJV+rVqZZH9ys^HGo*d7{Qm|HCuDG2N0R9=@hL$$$S6VE
zG>AvU<l<WT-jk*C<Lx5um^er78eL}@WjGTwr)M9rt~4wJH%{hNJw~^Lq|;Q<K`b1L
z97d1ZWAxyoH0?q#&ZelT!%{HFrU(tZLR3fIChY_ALlcGwGEb?0B`tfA2bH(S+*G8{
z4<U^%L`(6wq7=kSh`4dD6AOQ%&m=R3qdE@>j1f<eCXbntgFJ>Vyx{4^5hdmgX<>jD
z7Umq0A;IQh^i1rB$uk;}P#_9}!G|`TAD^>e9_h#|dKL)q=GAlrXG5?-)Hv}qNKyhB
z=Ec$R;`3G6!_L}-=HPerzB|38d-|f1md~dzENOXddPV8Bu{q<Mc;s^1ghIgb((~VQ
zz3F}SD?AADL1yf!S4cKLjGn3Tm$n=%ZTY+yWps~v%e(P;hj+@M#)DvK=9Np;X3F#1
z<Mn;=JTV;4AoC-~^cIJ>=!5#&K#e~Fg<n3QZ;RpQd=Y%jn);f)W_-{_Gd^MxN0~oT
z5~n9%#ZRE1LcDm6k2L7`xrQ)(wm~z#grhmR2~(Y#vo4}JYa^O`*>mbt{<w+}$Gf=J
zFVRz9xJRN}@=xcb1lMoMo-X3<o86An(!X;6uFUs<*zt0AcMiPteXruQ0q`*l+u=c)
zc0cCyyYVFPJAiF~M*tUtJ_>VuGvGMD9e^_c_W+gy?gtD49syhpI2w!YZGeS<KLlJ1
zxDT)y@H4<2fP>&D>;bF>+z+@N@Ce{;z|nXz`&qz3zzhGYyL&lc1K@hVt$=p}b^x{k
zehJtKI1m2p_yL;M{NLT(Wq=O=HUJ&~+z$9P;BLUi&hG9G!UG-#eC{382NU=zU?JdP
zz{P+cWACOJaOeTF7w`hWJ%ASj?g#7uJOa23dxoR&bm519g@CiL)wdY%8o*{i+85mc
zxB_dnJ%HKR&F=*K>4)9jS>X3`fH{BzvByvbSOC}nxBzfF;7-8ZfWHOo06YLV8qZ07
z4Oj>`1G{_60oNby?%o1;6W}huzXR?AoQd7I7~pEaOspq=0XQD;Wk4^jF9GRE^+N0-
zZUI~XxC`*7fcpS{4;TYH5!=d{Sj^uBI3Dl~Kri5@fb?YhB1|e<02=^z0p19>50G}-
z4+F*kvvB4;0$cYvfcb!n0qHcL8E`$|4nR6jYXd9=>;&8dh@V2!>W-jZz+V7*0bc;5
zC*}_UZUO8MPj?sKD8PMy*?=*?D!@!Ql{)~(13m%h1^gJ0p0Ixsp2l{-f&aw#0A>U3
z2lN6S0bBq$8Z*vvz(T+^fQtct1K13B5O4>e>r3PZq*Ijrfb`_>5x}j0qetK!6u?5j
z-vcfNd<(D{@SLxZAMj$pJ%G0Xb^`u8pa%V;jbbO@KLFi;C%}tY4tNUSdcZA!oq$IG
z>Gx&nQt!Sv)-2E*%~_7q1`o*G?ii3oczP(mt)sj90!ql2eR@qB13IN&{(H2649w81
znL|g<9(Ka2%=Ox(r+(*xyfHMth&083J6`YZru5qMp;_0b6%FbCT~y5U0N`z)gC%i#
zX`H?Z=<9pQe;?=@dda^R^a$u+UA+9c@$wG<9RywURkd$^oKBA@Ufv6R4Cr$~he*cD
zFNx<b2K_srPcYM~<MaUN-d^aNKwsEP{r7=B8}w!7^3yIC6%fz&g1!ax@n*U|UjG5m
zw}b99(>)v07a<Ad&%pQ+-EMrwfIbHF>p;KCl0R*^tb@>0PASqok8~b$x-_@ZPFlss
zKu;zoTR}enda*hGwef!Z0`$y3b$4H6rmv86YUfL!j{<#xnT~#@c@x`7WXGDaQlTLO
z(&?un)1{tBxu>$uL^<?Z_8AxNbQ>~_veKOnqdym8K6UQz?#?#X7mBxc3Fz6NPcze3
z$LVW9F9n@mrZMVR7^nXT^qHWqGSkz-M*Y75eJAMk3G}N3o%-+==pTSCe(PAtJ@w&L
z*$2ud#y<l#A9KCk_!E5$=(mA>8f8%BknnK+V$ipPPER2k^t7*xb_PHXznMJ$CeZ78
zq2CAkRiG!+@4cWe0sU5U`_gVQ>OTN_2k18?&|3_827HfcZ*_OyoIt<PppOClLC|S6
z5bvKHDB@2s=zjryvY9T2N$@uS`q#bW-voN$f7#bR24y}9x&A+(({FcYEHcTX7i0Oj
z981!Pw~%fod>#7j-HebaUD?g)H)U+=x3&Mq0qM&!jDCHIdT)fU<21MXN<kLwJ{`Ul
z(Px?IX)}!c7lZx~=yv)@d|3ke)1cG6!8g;quom<uK%Zl#r_D6V{}Jf#^-}(CKqp_#
zPJf7RA3*+}`Yn7A;>TJOKZtMd#`%x|UvO_P<>Vtj^`i)7X1ru7=Nic?YChFhigMmS
zx@l%TAbrbE)VC<o9fl8?Y(BXS^tVA@Nd+r?ORF;Y@+j!%!{?lvK$k|2c>OBqi$Kpa
z)9YkgiT)|*p<d{tpxA3cA7{?*i|5Y=eFNxDGu<RN6-=LLrl(Ce+8+gd7xLSU8@2B?
z(0>YgSwj9fM*c@Ze;M>-a`!6eZ-YK7A%9$MKLx!XeDiA)=xdDfN5MxAfS#=V`Jmqj
zdb0LcfW8TIUqbmyjPj$PztBtm+dywiLMJ)f3%P#+^!d2Y=rGA++H6q=RZDz)3+23n
zbjjrGOVB$&Kc6xvISa@lsGf@;CvSj$4(>BvHPy4!s7BONjJ3(DNS91r0-#elW6kxj
z??CmCymx|rC)KmxR1eMF+hkqTpI@MyGOVTY&E?d`=`VpE0KL#mkHq`)CF)rV`a!az
zpC;Cm-YIpI>KTPK(=@ErXtT!POLM%Qe9)JGo^<>{Uj({cf2sT^=vVfV|2EE_Y}_6N
zy%_n$d+2K1D&p;X6?AI99UrLvPq}=TxqmhB{G$eAZI1jj?neK4%qf39=vO5vpT@HS
z^bMfDXwFY#`HZZC(3Ea1(w&PZ<>pz_701(2efJ?<3DPCw=U&j~fu3wU4uI|j9pVx1
zk0;*W46IReK~Ki-F`zqpDZd!>Q$e3$E`MFT`~c_|gFeAbH?0pgfj%4b1bYR3Q4ym5
zpwq*T7n<|S=|Je;UeMQqo{T>SK;Hm*(*C34HJ~S3AB_QhIp`D2^_$vX40;{t$@mul
zVF~C7_7weF%l+E~I<?Pk{76pj1D*P}*j)dtt?3&x(s&gq?cGaA_lqRu5FbAP{efQS
zgRy6DH|WXq;v&%LDb{3uaw+Kdf{tq9a%Ng5)`0$f(1T_=FQTY@TS5Qoh2(bU7oeZo
z-rbE?Qsd>9NjWP9>Lt*r{i6ICL$hv9^9&t*Q@UrUb6bXI==iPuyhC%Y@9!O2xN*ST
zq3)WYh0}-TOdmRa#!%;sp`($0#?VZ$e=rJrANx=a+{uKiyW4Tnla79vd>#EZr)gb|
zeuo{}kJEqvWmqBL<qxLqNY{3!GwM>nH4hDVDpPwcQ=&>KRq2SkGPPSA{r)pk+wJK0
z>r8EZTE7<uYHex#zRc9#PU{!T)Y{YMJ~mLhH)F*1f!b>s{c!(&#)$g{YG3sGy91Gj
zGarQwN3HRYeu`uGc_@5dzsntwLF=?uYA88kNS8j=aq2nX@CL`^KczX|CrWF&rz2f^
zn_4OpCG_;$$hRE$mIL2%;9CxS%Ykn>@GS?v<-oTb_?83Ta^PDIe9M7vIq-i22i7lh
zi)OsxmvA}9Rom2}-U{g(v~(#veRI|vr*F*Cg)KvIHC*Wy5L;;C`t%03fMV|zcl#b(
zF5=>B6nEG<5trD@!rgwR)BX-!IC>BlSEKDnbBp+AU&~|MJp*JM`x5fPHv{RymcO_<
z2D=3mdwIC~TY-pb*wPi(Xog~si0=t-H}R0wa)DKQJENJPo&&)axwz<qkaR6x>=sbr
z$8%iuBS%p23=Kn7KORY`Csgxy-Gb81an)};q9o{t7fQNF^#8-37>|dwst0(Sf5q?x
zhOaYxpW){W`!A90K8fKth6M~~GF-&4ieVGO>lxm`@BxOuV)z2X*BQRg@N<U!mva3K
z$1yBmIFsQbhE)uk7+%lt4u%gfwDJXC-=69mU%s&1<17u;H8!8`%+D^wSGm=75pJX`
zNr`CVaxPE_ERggdO>4woB|(V^E*A7p47$=kCC^I#lpH1I#+jAKe<jpgmcO3$HEd1D
zaarFZ=z>hYna5Y@-x1EQ;?`oW7WEM}Q<xOh|AWkj?Cgn=)saB3Au_SKus|=!pD2Ft
zExTNsSY2Nmoan1;4AlfCh8t?-&5VU5rNt9h)PySi6Y%}>+=>23qc&^aLg(~yC*lR!
zF3qui4XVo!TeX>vRk(MoMct*iS%Q$}$fO4)($WSv31keQrl(~J1x|DHCq++7>qlq9
z=^6b(by3jzxse}9N(eamlP$}j!!O6^X8awPu5EDx;?Pu_Py}$7rqM$-nU0Nw#?`Z%
z{%|yOe7&qdYce#dEz|KE+&h}_hk9up)4u@6ok(9mU3m3Ufe~HKke%#TO(hhmtU)B8
z%kiUk@wdNU5XJcfMVE;vQK-89mmAR?hz_`eqKiJqZKfN4UPBld(j4EVW61*ugMP;{
zbAn@G+O^sVj*7Hv@b|K`bs!FYgo$M+MH`aYCI0>mf0^+^shDBEpgMH=v@KoASUY}{
zbruK|h9mD-z!6jNXC$?JB(+!k(MD|q(s2>Je=zEYbem1LCx+>EvfyZ%b~PfIj?MVH
z9r<j>ukQH9&oNxf>Ob5u!jYbNTISfyk^NAQW4L4N(2*yk4?S&Y7N(fAG1>_yOhXjH
z$hT0#jC?J9=+qOwb3)0;OHWC|jM8tMRx(oLMu`ji+n0x$23X5H$tVb=#w+NT?YQ(5
zS;cv$P(JtQJ7=8YKo$M_89CC>z5(%;fLj>PlPS?aU^8X=DAAyn4G#1R&u8Aa0gsoD
z{{9R!c`%ULaEA-YhYUj}sr^G`H-=?vHnqcba5zdvQu)KrMqfvq=r~U_34Ei>qvB1u
z^puNF!El<3Aq1FfWzS+@(=?i+9gcqv9fH3Ow3D14M~@N5Gn7$ZIEpc+B-+3<Wy;d0
zInuMTXPi52?wItU&c>$ZEAgP0!*MPWrD<nSan`am`ZeKfKNX`bM@Z8q;%|>+Hceow
z!7Wh6KmAfiY2zgmangk54@^^YK>8mvM;bo+@3>3UafMwS6B{GpiLj<Hyes^Ew!sr3
zjgjDl8k)~0Vmh0M--D}Z3<U9%Xie3Gc-!%ye;`<uU9A|Rj7R$Onj?Kg=7{vVX!;1J
z{CmlS^t##U!)7mm#Z7yW{<Pncd1o4a7QwOfSUEs?#~e`8QTlm~IfCoxzM7KKFJxRz
zeccLYV_hV)qAnP4l1Vor{%T)Xi&WQ#qlpP5ijwLj^XbR!Lv=cM5;)dbsYtp5&Iz^A
z#=77oD}r^waLA9SFYK?r1T)@*f_&$M70wCsbDa}V%_S@98mCNgPN)JboC02B5Q2#M
zBCG4<Z`CwLCN$Qq3e^QBM17St!3c=8!SIUU1pLV81iV@SA+3u_Y$Sr8@1B6)C0h}#
zz9a_;!}X1I=*bFHAFW2*T2QnmGNHCU5Uja`1{uEuHzC*@^lO5D#0&a~tmx{7pt#Ya
zTClFELGy)IG|?aY@R{Z3vQ5&W>r|rd3x|EH`6peeFk~^|#gEa&%cufLOzr<~F|n+q
z2&_%QN{G6+av3_bh3b|ja=ILvu5O`m;>yrIaV-1^M|uENfJ3WLx0Rg8p)FIl=CUe|
zqO8kQS<8*Gma1EGS(o)FD~~f2GK7KsiMn586jsi67Jzk-xD;BMIHl;ywkai7=@b;t
z9w34#MqFg~>2iv{nszor<$tTjs)jMy0M_=XcHGE*uxiIwH@J;FM|(rR2UD2VGavoF
zPK7N#U5nb{GmLezX1da~ewtb@o6uP6_Se)pIS~(!@GQ3(CGOMV``fKU)3K^m>sNE8
zbZwBP*1?JR6O8q8B7U%@#yJr`MBC1MPQ(w@b}x0CQQ|%wFT<!w*-T8wu7i@BL_DO0
zUH3%%2<?b1ex%m9)oo5H?$fc<pvT~?SGsnhvHxMFrE4b{`yPq-lQsIK1M8IzX|`LJ
zr)#HZ>UVg|dD690jeU$n{Arr=q2*?jxKGDZQtEem%*1rzy_hlbK3zMrClZ)+?W`nt
zXA->9#zb<uc1|LKW9b_CR6VbB@w8kbekUA+4DF-tL@ZfXd;DPF9a>NRll@48TphN^
z8?6Be?_B1_UHr<%E#a;EZN<Plz~{1umy5*I4!Xz;I&9>DmatULdQ17&i}XF^>VDwu
z`umVbKU~XXc~<qlB=Dze^p$wJ6#h+dKU{0Fq(98~RTlhLz*9SS`rWwGv`fTaO`CuM
z;oxhpa{3I$)1sBqt9?neZy}bSNWY)ctKE+uAiZ6`fVRccyFd<Yq*iFr$A<;J=lD{y
z?8<+dD$sh`fj^3Jdg|9-1pYLQejdeqeZqM9B!9vs)~qM?umkkLE3935*bREe$YEm#
zCIEj9_&GS>7RAG&iR)5KfKIKazF)=Zw*=iHom2e9`%8@9v{~XuG5&4Fr*Zmaj31Cr
z-06AcCc&2gPwgze)-5tAhv2V_uU;qd;H<c&LtrTVbE^{Z&5R$iR^qRf$u;`87p1@G
z8i^mr_*wm^ozP#NpV1w0eHVC2zm?0O9Rj-Or&b7m>h*3BSM9mFzr@epB=K00i|db!
zf8l!)Ka252;6K%?=A#Q3fA#=bPStfXz0#KrjK4Y{@rwUHVth99;dCzNHO8M)E7Pm7
z_?+?ijK7uBp9Y1Zel@R_=~eqTFn$B`f4(Z8@t<?OMT|ci8`)IOS{_GwR-CSz7=Lb+
zTf{x$FZLS;$@F(Ie=cJD3&1=1-V9z#g74yT9L)c*oM}Eb_Eq1_<@kXY<7*km-!cBK
zCbvlL<V^iB0Z{t%8zf%wXDQ>S^0+Adx(sqM!p5)oKBspu-&B9k!vTq1JEsFr^&VU!
z>z&E%X-ks+Jx)J!g-lQH;L$Yz4W@D~t#^y~Wbqg8Nicrd28nkue%dhm_ACXS%6ZG8
zcRyqNhZgyKp7D?JxJ=;kbF*YQgIFKWVf^)sf3Hrq=L*JO3dT~sSu9uijK7!h<G6gK
zkN?AXs~uPZCQ~`LaywOfRx$oQ);m?sV~juSm+evd^(pZ9-{TrOQkLJwa;5lr74UZH
zH*xwo+#Z!?E8`yz%6hNjn$n^8)c$!Edl&$o^3o5wn6DO2?`Hl$O~my!<E{F8!HM>I
zHxqcO_lYLA$UjRKqx~oFs?CfNAnPQVzGAam#Faii2K;dRQ+{;@XW9$AT{&-ZIaYgk
zF*XkE_^^}lgEz=}$v>m(L&l%Ud{*Olu9yd98uQ;vfX}j-A7=wk^}fLUQg+}$;O*M~
zBBy_1rCSt_o{Q@c<9Bd5N)97vFqQw?HExm4DgHuF8K1*;Qpx8PxS{lKRm=1We<$Ou
zeE19F7qNcb%;ld9hn33N%=}UPnh3mIzh-iJt35x!c<X$Z4~A3u*RPiCT*>urWqgeJ
zgH1Yd{gUzY>0P?Ui@$hApYiXoJkMhMv@>M+4{vY_Od;d{$@q6IdVR&2_WfD|{BYBH
zCyU$jec-A6)jWTp8RF_-eE(Xvh@<=BI?Q;h9UgI(EaxTeuS#FQ_%`Ohl84I}|1#^l
z8kb)){&ODpRa|ejllp~q0nbxG#!myD+HbY1Ux@Ua$OKBymg(29d@8$f596(NX*3Kt
zmA{zBMakiljNixnxryueAI5*M)-5nT#!os|mNVSa{@*aZf%}`y>HiG8oqYb4(_8(?
zTgS-qr#8!a#k-~`>pjMI)Jwe5i_tJ#)UPJSU%~axV7%2&zMSz^IXr0`*^|?I`2Xhs
zPvu+vrJETaW`54#djAT%U4LhtC(CKRMz;S>PQQ=wKW4s7WBezKf0_BO#_J9!nqB$3
zfv0xr4YGWCmzu6O82|KjZV^ZK#dYraGX2$T$MP6|72|Jb|3UdbLoSf%t@BhR@J`Na
z2CF#zxD9TBR{H)L;~%(Q;#I$jFO=nf%KE!N<kGZC#(&J|m3@Ac@w<Yu9EBf?1`{7P
zaXH0Y&c8AK{LL~wvWu&Y@m9a$YsS}xWcqqe-w4I0@@KQ1FJt`RG$}ueA70KF2^>%a
zyYV^;cq-?Sjc$=}9_PA)@%h}}#f<-i@ds;UdS#bJPP8w7ViNp9;Hln&8)P{za`_K2
z-s*o|l_SfipYbza4>R73qx+5|_!C|B?c4@DwP%_|4qr`@eqgRl|250=P;Nj2@Z=9z
z*YB%1{e4_M)KgsTz>na2Gx#6Imo0Uh(OMc5gxYD9ht-U?+Pym%Z?*qJ^6l$&Cc#$&
zPxUSh$o7olR&7X<{w_{$^|$vYNq>;jTi5TGOp^7!%K9>s>)pxtPM+^zF2wZ_<7czo
zn8Wy%0$GlAJ@OjxcKrW}(=T8<sq9sBvMgs@*eyz?Z$Qv>C*wb2zh$mOYky<BbzM9I
zhKTrS^-msU{64lDP=9gFp2F>CJMcE!ReGX==5hKxUb^TT40IJb3Ey)+{6}1lb-w<H
z@!L1Jk(=t`3n#^qXk%3sj+u3RcF_X8w4}U3*R?<}99$8KM1x^HTC4kUwik>bMW9|^
zQBz;(tI-3|`fx<|H8yMh`r3w?U^E!WF38K9Y@bM{Q#wA^3)V%$tF@{yz8t3q8f$A;
zBa11Pc+^KpgUvpwA=}^3py^kXM@wB(8bU#Ta8)Q0)Wun5xY3UzSRxhTsIZ<63pr&T
z&O7V<J~S4!uaL+Bobc9%ajJMlzDuWL#%RbdYQ<4zXbocZb-J%Epa+7HD3wg{oKOhq
z=S|Ud9J%WL=H}8|<i)wRudyboql%S!Lw!xizuKeevr6aBm|lvK8!Pn^z1D}1;ptag
zQR+h5aKswMck6T?N^J`24NachoSxWd*nEVp>oXTjpX<>*^NKMra+K6P9%<4&UQXaG
zUSL!{f9A|`PlaAFeMYH=GF8kiG71Pp>h)@Lu*UXa)pXJ-o@CJU;>+hvpIcHy{hw1F
zEszJyg@MMfU{y&?d32JvEh(*Wc@T2c<?(nXi7)qgnkaLAU45-Dud+5e37_!`hKpPh
zJtxjoS6*Fkm2M1K$vm6t0=-q|@f6~_gE}1=%fic}g`xV0Ug?Vj^CQ7%m8U%F%F|1L
znpqpIs`0Iel;?my5FHu_j86S3T^@z&!Em@fTsFt=cF!s;nNhTOv7VQmPyHybuZeg9
zJ<(BDfzfn58uo>vk&<#>UpnELVu*D~dA>e-!5sVrH;XI$HLhH}JTCQB4SH2Wjvi_7
z*G#Ue53llt10@wXx~HnaMHv>9PtwaO^x}s3<<qH~U?}o03i?SQh$($&9l9i1V~pQX
z4w6M0j9j&$61y?=cxuXBx{@hm_E6(pur{^OM|!@X1dZ3@atLWh$B}B1UQ*%GD{_ee
zo_tE_0Rszl{Fxz`KvFC}4tX5iFZBBweEv{$H3^#KaRk9=Ruqyer7ODM7m4a2ShK{Y
z_o_mk@?60Nsa2$49#5dOGRIR|iMs+_6bcr~RIGs1b&S%2IrE5#%o-u*7=OJsM220A
zKj}j)zNe`ZitJ=bSs%1cj*UJyUT%f%apkEp#c+Abr^H3Dm~3nPYMMYwL_c5zguSg_
z-M0obRp=OEDdIg$l)Zd;id-;WW*KXO#(-lvW{#y&W#YO=GQbK{4yO0CO2^)nfM|V%
zlyJk=x^lz>7r`7C$)~TiveC6oP^doGDiW1P4U<A#>1p}x*ve8$@;ARiUqIX|2nU6P
zg@aY_D(d{flJdzqD`tvqQZdgdsZj9ZTs4Ipzp-#lvH9q1iDB29V4&r|8C-@{gmuf)
z7v;Kg;fDDFtH}%VuY{(VY~D8*jsjs1Emm_j%p~k%+%%#@R}M|daAENL7OVv{Noqoo
zUQ%AHLvcOuYiRah)pF$}n6yM|W+xUT4>cweMR622C0Iv~r|F@plEslQH6#xXT*NS#
zXbXIZCSOfs&_j(<)?8XO?gz~83YrsyyFxC~>?VByd#^<`zDR^zS26XaY<`}sY?YV^
zn|PW<ErFV{COgjnCMCrtS6I!-@CC?>d!SJX0~DVYlZ*yzU&v%Ipr+;~_F)d78OalG
z&m?Jw42g+1sE5o#6wGd-k>!|$EN*)O<#`aCV-+yNDyeBRE@1v<pB@eh|IqD*FF1Wh
ziSEkIOW=8*W_nTtlU|?;Vs?ikP5g|{?vv0Ty9phAjC;5xSoqMa(nA7mtvyo0C5qJ6
zCiJ4VrX*k|GjZl7SRm5b$?!p>TNVyBNY{*Pj1HBCT;)Tb;m+cd-O&(qy`;QAN23!x
zx!G{qfLAt2sXXhuuvTblf$(SGzEf%}h{z_vafd_GSVzkiv3?5%P+J2$ixO0l0@nt$
zqB@W05DX#tk??@2Ny_b$c8At^%y%8HnBe6eX)ur5(JYelP=cE<zadyhkKmS*mZ6`8
zJsFU??wk8gol1@SY+a}E%tGSV%L*?*T5XyDlZhYAa48c%T2_!rFiD<}MmfT9r|vix
zRG9nb!U^UzWYALveV4k~r$r>o4RlShw$BT#*$F&9>V+0WcEJMD!o&r)oT>WMCzDTh
zOxqCT>6>S}2-`lZp%jgk_CvU=KCw9I!)z!_1{R??I%Z~=5ZDTuLVVF+b0}&x6LqU<
z%ft?ed8m3!lhOc-34%N+F})>P(L^`Q7^ZyN_3&g`52t)nAg&Cw!if)8;v7Y8x|Q{&
zy|ol<1I24Te&<~-DoQFyIn}Nx_S0xjw1T`>HuO>;kH!h;&A=si1BNS*u+S&h#?BQe
z$J+#C^<DJ3irRqk!*r2U!!}`;vmUGNps%*X55F2BYP-9Y!fh=0l;y^<UbxDzMpmD{
zKzG5-VTW3;4%RdT!)Cv~NqPO$U?H)I@7mBb8?lur^gC6XUGaj%%H3ER#_`8%{Ee0h
z#WB{gsc>On?7+JOG40(QPcWAx`1WG22YgW<zo&yyNa;Rqe1%1xZ&MlXYm`SO;kR38
z)2)a7m}BeYCax?<J8fJgv6iBxr=*DqAXWYoEv2RNI74T9n-T{q)a*%g+<RRv^_Wd)
z-C?t5XrrHfT$9@$7W8WR62~eflAo(2-(EUgg*|_di1)ty^}-o29~B4<qRs!>9>Qmy
zx#2AREB0+@U;lXbZQ^QgHxLksqTHj~5<UJ|bxK2Hq#BzS*wTU{k}u4Y{U8;{UzOr<
z!G@+kgfqlYRmhK-&2TJea8hc4%x5r@#2E}(B4LVXyG|S%L0wXJ=!3b$bh49h@MEKv
zm^N{sXq*$pC(hZ$4QL!%%KK)C^JkGb3@YZ?Aa#w8pSxi+d9ood8!iSAz7oI3sx#%$
zd>jrcXV`SSrq(OH3#+jofywHdx4!eR+?3n7wzCA*N@yi-h1g5sWd4CM7DL~>q$`^g
zdL6WFydoIYD;nabqOu!igJY-5ialiGZI%j2R&?L6REQsQ@ew=ZQY|k3H`eW9BPjW{
z(KoEy3#CsCml!WQE5{fLZSkLl@Zzi}esrvMU#t?HTKi+Se(y1jfAgZSZzG-B?Q)k3
zmQ8K=l#qR}-JF#x0i>hz`?x_fELp9w65w-OTV0-F{Ia0Z6qhSnjZH}T?8Ox0Hv?@q
znqW6QP4>rK@l#YX`*{3A>@(6nf!HM=Clb4n1z|i|QD2KkJp92(#5h*fqrMd$VX(0`
zS&NmrQk<i-a%dKO;4(VL#wj)*rP}siELUrTI3{ljvV@v9BvWVd%@ZUHhnU6D%v3z2
zaQyDYWd7Ah=_K2m^jw$W{b3%hnlec~^C88?K8q{2XG&uf__>z&;xJ|BO+8%$gWbpC
z8k8KRYoJ#`a{1{Waq4&6o7naTphhR+2cRi+;R=nt)Y`H+w8JL0Nc_ocyv;_5)OqaT
zLAoo+C4=oEC%lL0hdv)Sldw9StLFKjcm$0sjNL*M4S}Y?3e@7TRvJS?3e6|}dA>kE
zY+dK*QaF#{7`L~%=3l;c6qBy)rjBhtU3v6#uz03LPUfCUG1i58aS(_;K2kzYl+ea4
zgoeDwzL@L<b5I8Me=Uczy{^>loH>lOb+NAeJKKr0&r9}{YM<OU)f@Pk$CW`m!hqe!
zC~ugiZtKitQX66Kq2M?a#EGDK%Bc@K_;h-VNq%0u5W7d3<jBV73aMq2gEhVed<TeU
z5)_nzgUEc^VKhoq4`P@f+&J!y2al^jJO>iNr<{U%u%X^xt$Zl1?_a*XT_aBJVc*ml
zynL!Ju7<U+XV5CFm2>4Hwr>YX#s({Q43Fg5X`WKElrE#hlzM!}1!NJx0D3$UO-%Ms
zp!JCxJGRQlS<7ZFD6cQ?Ij_O?<*RK`b-XT4#zIwkU45|GA8f$)n)>p{9wt&4c`=d6
z?NK<km}W7#im`T~TEoBAeYvSSh{MLD?HrPapB5)+7YB9t$P}KdjQXoBPkTU>@w9_{
zPPmdD<fg~VZT6y5;SRut&DH(YK|EWIN0L2oVepW?Np}8Mp49k;B{w}+23N`cNsWAP
zwWh?z{jxivV$Z8Ux42qjcq;LDSDozuk<~t?g}tqlfkMn{Mq2C{%SM_Q*l5JpZathO
zQc&@fnpp=;t94JJ3z_XIWD^&vOQEOHb00ddu6~u}@Fd$ch2#pS5~-V#lWklD>Bxv{
zJAQ{b-}InAZ82h##;S|@isoh=CyJ5!I=tNxU9C6eSX4SboPC=E`g-baYlKYa)^dVK
z@^t(+%u9W26COmcXqyy^V^XL*xZFL40Zz294g%KmQILh=Glh6yGtyW~`xQk+c+Vn7
z7`i^q>?8wP<m#~6SWejcFb-Ld^R$-t=EN4Z{R%BERdFNHn}b2~Id(GTR45)Z6ZYV^
zmM}EA%2kYbWDVP)g00GW_0^Xn)5g9>{B<AWQH6x378Bk{P%?|$HtroU?6_CZV(@8>
zq*FTbY2xQYc&b0q<O=@g(c2Kdl|j9SQ={Hp=#y{L!{HZOo?>um2<#n@0$#i(J_=M5
zJqpLPlDuWpV{yDy$q%i;rxI_&$k9)kkyo}VrHe-MrXpVCfiGcu>ZX=I$NC(4Tn6Pr
z15#&w9;MgCbMi-oP4Bna?!YLmJ6dx<gOyUwD&0tFp{|^S2NoycfkiDlvbq*uodt}B
zC9GC=c)cJPZqTyp>Z8GIJVTAI)2`qc-mA<OA8(%!3TW9BtM)~zwd}y^I+Q0NUW^nJ
zJWCwGTbT(F9W>g`r3`%6P!rX%#Yt5*ZnIa^BZ>#`@K<~^HalD|ypil+HNSvb9Y7%}
zhSL3hotHS`E4s2U`5LP%L)~BHM@Cfas}1>)uRcl@0z=MDHa1hWY<fT%r`Es)D;rlJ
zpRaBO-b>(UsIIDB-TNvl!@(vMqc8iadvH<RV%4E;xg_;Q?{PtT-=AJR#%Bh^RhZ^x
zC;+}(k$CZI$_5gj+t3CesQeXaZp83g#*A=^So{ya&TJs?I(|a_XqsElqMUz>h{gX{
z^Ls(+hoI`;k>*BB!*@>w&`u{JuKp4jzn-AVDF*&?{HBJ=ufB`AhdPC8RFeFXil0rz
z%?Su9zxsY^$6%QvhhIjg;)r*d5^a2kb`j#~2r9qvomG72g)a3wB{C+<Q1=9vA>doI
zM*G$GSPw6hsW$O|ss5{a6kLg9^qV6puD;va{9T!Uw*+F0h-a$(jMmoRhVUxC`o8P<
z#WDpx+aNAgzsf(F@tXiC{|t_+-%VM}`PJ{NP=5NRhc*9Az)=@f`RaSI4Oaq*OMM4K
z#$;XUo?r`tIlua@>~_xYR2}0?Sr&MkCI4>zzQgXNGNEcOXEWNb;yW$*)pr5gIDam_
zYhu1s{R($4Zp`gh-?RPn2AN`@O309SZN7en2<11rxy3EXTO>Flf$`VcpNBv(=U3my
zJ;M19kZ_wWYyHoFX3np^quaSvW?EwDu(kddE&0{=b}uHz;_4KC<A1F6zX=k3>q9kD
z{VvhI2W1{h^*zS#FGx1blwbXB(SFWvZLc-{3G!0^RsHHaznw>!pT>t|U*%Wd1CFWo
z^T3k*p-a`T`n3Z%bN%YO!AE{6k{jt|ES_JLL%)k)&aZwaN!u-RIEiSwR6Yd<S@Nsz
z5Iae*aH)36SiF7}r|)!8S*ri)d&N1Nf3&53r^?9iM6N%_f;aM8@Txck&*c1;WLg>5
zFPLK@TKglvhigi}w`C8@o6}Us7_aJ8Ii@73f8}E`1^uFl`LaZ{SxNH$^;a_2`<4ur
zs8-JTt^9j+kIdg{7#0~<`4wEo`OR&ld;Lj?IZY+x1eT~4N|Jy5Z)AoJ&ZmHt|HDDB
zuvjELoj(A#%2J^vDu&n6MUlVkDOqFlXbH~)C`M3n&uH=cUv~BHdrju6m@1+AYpwtP
E12BGJfdBvi

literal 0
HcmV?d00001

diff --git a/thirdparty/bmt/build/src/CMakeFiles/CMakeDirectoryInformation.cmake b/thirdparty/bmt/build/src/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000..c883c69
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/bemdeppi/ham/thirdparty/bmt")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/bemdeppi/ham/thirdparty/bmt/build")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/CXX.includecache b/thirdparty/bmt/build/src/CMakeFiles/example.dir/CXX.includecache
new file mode 100644
index 0000000..065d1e7
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/CXX.includecache
@@ -0,0 +1,36 @@
+#IncludeRegexLine: ^[ 	]*#[ 	]*(include|import)[ 	]*[<"]([^">]+)([">])
+
+#IncludeRegexScan: ^.*$
+
+#IncludeRegexComplain: ^$
+
+#IncludeRegexTransform: 
+
+../include/noma/bmt/bmt.hpp
+chrono
+-
+cmath
+-
+ratio
+-
+string
+-
+sstream
+-
+iomanip
+-
+fstream
+-
+type_traits
+-
+vector
+-
+
+/home/bemdeppi/ham/thirdparty/bmt/src/example.cpp
+noma/bmt/bmt.hpp
+-
+iostream
+-
+thread
+-
+
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/DependInfo.cmake b/thirdparty/bmt/build/src/CMakeFiles/example.dir/DependInfo.cmake
new file mode 100644
index 0000000..2278187
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/DependInfo.cmake
@@ -0,0 +1,21 @@
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  "CXX"
+  )
+# The set of files for implicit dependencies of each language:
+set(CMAKE_DEPENDS_CHECK_CXX
+  "/home/bemdeppi/ham/thirdparty/bmt/src/example.cpp" "/home/bemdeppi/ham/thirdparty/bmt/build/src/CMakeFiles/example.dir/example.cpp.o"
+  )
+set(CMAKE_CXX_COMPILER_ID "GNU")
+
+# The include file search paths:
+set(CMAKE_CXX_TARGET_INCLUDE_PATH
+  "../include"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/build.make b/thirdparty/bmt/build/src/CMakeFiles/example.dir/build.make
new file mode 100644
index 0000000..d8157ce
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/build.make
@@ -0,0 +1,113 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/bemdeppi/ham/thirdparty/bmt
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/bemdeppi/ham/thirdparty/bmt/build
+
+# Include any dependencies generated for this target.
+include src/CMakeFiles/example.dir/depend.make
+
+# Include the progress variables for this target.
+include src/CMakeFiles/example.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include src/CMakeFiles/example.dir/flags.make
+
+src/CMakeFiles/example.dir/example.cpp.o: src/CMakeFiles/example.dir/flags.make
+src/CMakeFiles/example.dir/example.cpp.o: ../src/example.cpp
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object src/CMakeFiles/example.dir/example.cpp.o"
+	cd /home/bemdeppi/ham/thirdparty/bmt/build/src && /usr/bin/c++   $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o CMakeFiles/example.dir/example.cpp.o -c /home/bemdeppi/ham/thirdparty/bmt/src/example.cpp
+
+src/CMakeFiles/example.dir/example.cpp.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/example.dir/example.cpp.i"
+	cd /home/bemdeppi/ham/thirdparty/bmt/build/src && /usr/bin/c++  $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/bemdeppi/ham/thirdparty/bmt/src/example.cpp > CMakeFiles/example.dir/example.cpp.i
+
+src/CMakeFiles/example.dir/example.cpp.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/example.dir/example.cpp.s"
+	cd /home/bemdeppi/ham/thirdparty/bmt/build/src && /usr/bin/c++  $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/bemdeppi/ham/thirdparty/bmt/src/example.cpp -o CMakeFiles/example.dir/example.cpp.s
+
+src/CMakeFiles/example.dir/example.cpp.o.requires:
+
+.PHONY : src/CMakeFiles/example.dir/example.cpp.o.requires
+
+src/CMakeFiles/example.dir/example.cpp.o.provides: src/CMakeFiles/example.dir/example.cpp.o.requires
+	$(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/example.cpp.o.provides.build
+.PHONY : src/CMakeFiles/example.dir/example.cpp.o.provides
+
+src/CMakeFiles/example.dir/example.cpp.o.provides.build: src/CMakeFiles/example.dir/example.cpp.o
+
+
+# Object files for target example
+example_OBJECTS = \
+"CMakeFiles/example.dir/example.cpp.o"
+
+# External object files for target example
+example_EXTERNAL_OBJECTS =
+
+example: src/CMakeFiles/example.dir/example.cpp.o
+example: src/CMakeFiles/example.dir/build.make
+example: src/CMakeFiles/example.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable ../example"
+	cd /home/bemdeppi/ham/thirdparty/bmt/build/src && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/example.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+src/CMakeFiles/example.dir/build: example
+
+.PHONY : src/CMakeFiles/example.dir/build
+
+src/CMakeFiles/example.dir/requires: src/CMakeFiles/example.dir/example.cpp.o.requires
+
+.PHONY : src/CMakeFiles/example.dir/requires
+
+src/CMakeFiles/example.dir/clean:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build/src && $(CMAKE_COMMAND) -P CMakeFiles/example.dir/cmake_clean.cmake
+.PHONY : src/CMakeFiles/example.dir/clean
+
+src/CMakeFiles/example.dir/depend:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/bemdeppi/ham/thirdparty/bmt /home/bemdeppi/ham/thirdparty/bmt/src /home/bemdeppi/ham/thirdparty/bmt/build /home/bemdeppi/ham/thirdparty/bmt/build/src /home/bemdeppi/ham/thirdparty/bmt/build/src/CMakeFiles/example.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : src/CMakeFiles/example.dir/depend
+
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/cmake_clean.cmake b/thirdparty/bmt/build/src/CMakeFiles/example.dir/cmake_clean.cmake
new file mode 100644
index 0000000..953ec20
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/cmake_clean.cmake
@@ -0,0 +1,10 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/example.dir/example.cpp.o"
+  "../example.pdb"
+  "../example"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/example.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.internal b/thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.internal
new file mode 100644
index 0000000..de03e59
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.internal
@@ -0,0 +1,6 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+src/CMakeFiles/example.dir/example.cpp.o
+ ../include/noma/bmt/bmt.hpp
+ /home/bemdeppi/ham/thirdparty/bmt/src/example.cpp
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.make b/thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.make
new file mode 100644
index 0000000..ed8b29c
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/depend.make
@@ -0,0 +1,6 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+src/CMakeFiles/example.dir/example.cpp.o: ../include/noma/bmt/bmt.hpp
+src/CMakeFiles/example.dir/example.cpp.o: ../src/example.cpp
+
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/example.cpp.o b/thirdparty/bmt/build/src/CMakeFiles/example.dir/example.cpp.o
new file mode 100644
index 0000000000000000000000000000000000000000..9c6fc5e5d73fd03928702fcf95b27daadfe3d71e
GIT binary patch
literal 87416
zcmdsg4R}?>wf0U9X!N2)MMaAmBPv*=35gge)$nulL;|J;6cs&$<N&GpYjT2tLPe35
z7=rj)uEm!2Qa@`^v1+9iHC3v3TPs%Erd~dq+gfibe0{i<R{9j>TWe;`?6qglB<Cat
zz0aKoGJC(X{$|bp?0sh65-q<hJ1fgmA<MhO^Y0Yuc_rCCS}NdDZ&ub7f^!+%W9o8>
zze4c}if^HK6~(twd>h59DPBYI*C}30@f{Seqj){VcT#*8#rIHrFU9v$ypiGuAl^jR
z4^sSXioZkgcPZXN@k10pO!4<AeuUz!6hBJwV-!D5@e>q3N%7MZKSS}4DSnpX=OBK8
zuDc+9k*;5&cqhd#Q@o4fS1JBE#jjEPI>m2L{3gY}rua7$cT@Z}#d|1zhvMH;+(Ypn
zD1MjX_bBeA_<f4^QT!K*_fz}<#Rn+<JH-bn{s+bXr1&F>Kc*Nf9M(CkmKYDA_$Z2V
zC_a|rVH6)vaW2ItP<$fABPc$F;*k`ePVp#;&!jky;(UtFp?EaK=TSU{;tMDqNAZOe
ze~#h;iVG<&qPUpii4>PmJelGt6kkGdDaBJMo(A#USo@zUW4G>m7_Qr&tNZxy;aL0A
z3ov>m+Fb)qgn0K*j62s996o&bd7$%R?XO^&DlFFiGFDGdV9$#LlR(p72{L^4`^0VM
zno^V;VC<N+JJvC`JJ$I$rh)P%VVagBP+v<x>?=`W$=(ZrEib4~+$v$e(T1#!_GZO8
zqEcnn?%3y}y*IxI-0>i{xb0k10jSGbOasQQptS<^trc?A?JLn9%=z9pu;n5*CA_kx
z4(?8aq*&*DcnRqXM7nRcLX=zX69SZapKf)u$H-sL%^N^olMs8K^sP@$34Il-;8a=Y
zjIIJ3WuBQ7K>HAZsmks^>3%T$gf;Wfe;qPvk8X~2!hfO;Cqqu68^XyC3L&bnNtN56
zc0vEa@LbX=83N@TxG_pOY*rVT)fF~NWojEFZwn_Ys{=JCb5`S+0u{=vpuBapcmMep
zNYwrk&=rUpr<~WV?3Ib7EOBo{Vu6;}ZZK3>7~&k8B8BkUwReQT{OY1KaSths=Of0n
z-?|e!xa)6t3H!U)>V3KGD-X4|9$dZhP*&^tx2`;R_^{`-p6T~uKS4K%;`Xsu1@rSr
z`U3WXz1zTYlGwiTV644$f2<=4|8pUN+gz50K!<n3OQOGxrA&=hKxn9{Pk6_8RZA1i
zRf`i|eWIqes==$Th5uD;-qNaOxUEijEy<dAqPe-T*=tVJRV8beCgMI+-Po{b(nV%^
z0IhCpZAgN;WG#qZ9B%}5OA=KzUQ1mf(G*_<a`VW9$$4I0LyMPJ-{LJzR3{soCr@rp
zv?Q9BCiH!LRy<MPR997<h%c<FzA0YS+J;%l#y^43b1B}gx_$gq0hiucD$0oYNcvHh
zSjXH$;LEdjhp}UPtRs5pL%PGVMt`@r9>7xR@H*xmX#b7scsk}DY|lnqmTg+Z+#aPH
z1muCX-uBja+voOx;NFhXk?m11xyM)88(aP2$o7|G?Y|YnRL9(&j_AAGCX*m~C}<uU
z{1cqqJde{H-6VB3D#47AXhy8#hF&>Pw?|^DcaBu`AAJkdqDIOI!0LT5VAu|IcpIc_
zhm=@542PZYUl|VFK?ihLBBcjXdUVRzmqb&G?uvEJh5y*;`J+Gw+^*gkQ>s_*D*#2^
zplCOUW70uhM-*_?)j&h)1Q&Kf>`Nm*u)2ZvKq<!Bv#GUF34pL(CG3OP@C&-2e})`)
z0%=er5Q2ti4|M#|p0J;_3}D|+wEK~EzobQ9mq3nofv<OwuOTbKJ?|)H)0b#V7YGAK
zMZvIK;i3|95lY0u*abcUg1Osc?X$2@cEw&14Mbda%-t4iugKYsh>(uyofWyD7<{Vh
zmP+pyOq~@YAsu`iN=M%#f1WfvlnymXq>q7gaH~!i@anDtol!{^0~|~bl66)T01Ua&
z@ZKWq*^y;+7vxx$yzI#(=*wkYxvWfw4`-e7Y!*yRf0Z?KO}6*vtfBj}yl-U#pn7<4
zp{(UhI4C;^%CuZv-he=JX>UbG^e9uXn9y_JF;N?_{DDZ<ZB`p)YeG6vH*{UB&;^M(
z3b!3p0j&*d1C}Gyd901n6Ci2U_78^7-?eD?n4d~pI&nsfC9G`#Ubu|{U2b%{&d%V}
z&O>pS^_0|MeVqr{gpyUAM`=!MO;_?XunqDCH=PwQnZu&5$ocbT?6UAi)`xAZK1g~G
zQgE^g<hewMF%arZqje7h_AunU3mgJ<O}Y#`1ztw29$3(Jv}pNzPmZ;}7<=RW)i4J>
z=0${F{9yPzXm>aE=UFrobl(7Rpjk2f)3M2E{W2i{TpN8D(Oz&S{Kv|rvq8BVNg5zY
zHw~@<(q)cTAIxgagWe9ZM;-Ar<95v686*@E&}CYnWb<O}$RfNy#z`>-I_!C>!&a>*
zIC~NGy)>6qc-8JoMU4-&Uu<_P4|=WV`dyN}cK!JW*rkDdEUq<Ot+(KH><#dps6^OT
z6mTEv#v;2^K?=pK4e-6KDd&k!_c-i3KNx<kXj(YfVB<52r)MvEs}Gr?)ygj1><vou
zLf+B7EFI%agWgK_9v}e+ai~0?2TFhGmEvY~*GQwnO9EKKrc&vlZq4ceK|Onis=_$f
z|G=oq*SCq8fz?%rglzR-SJWehLTZk}x>PJv9bF3MP%mGhYnv`ze`uv)SoNkdVV80p
z+Kep;`k8RQn@aDn8yEvczj|LOnA1&jn{G>&4(fJ;dva8@bMy*5AbA(MahLAV(d`F8
zGK@hmg%NWWU9&MKvYNC<dy%a-z(z^F8&pZ{k8PiS=Zk+EGym|Vpb(voO#rk6Br9rJ
zu3GZ{B+ugp)><(Yey$BR4%+vxU6X|P`5b8BFXX}>Q5&IG=v=c3({=t<J%@B&@?1)R
z&Luan$Pe74L=@oo-*yx0Sc49Zb>4@5WEEwD@(ne^y=TaUyd;xVkFUo`&aFGK*kGuE
z>D=%Vn7m#4X!gd4w=F{RxT)7ny?W~0sdKP^W3Y6LwLiD(BUJo!7eu!5-3_>3zF;pB
z^7-ZJ{aLqnCC96j&WaeAj$BywEJuN&Q`D5ZSo@mo$U)Nmu4m0Q35%#*Y-@sazD)vc
zm53fBT4A75gxE2WTaK*b5+cewzL$~+v1u6GDO6UJcRZSs2(cfdb_I1uLRLJTiU`=g
z!CgWlpXD9TrNj)?E4ahw;CfiPUqqH|LrOM1zFBgKs2yfa#=aaSn_L?umxzpMD7shR
ze$5J0C+MSPI=Lg1C{2-$4kgOmITD3qY+}pXg{x6@A2gyY`=Jq4rXLznGy9~8wTlMO
zH%-UWqMcEGpuO!vr+41B8zaA&WA7s&=@r%ca4gvS2syXz#6IQvsq?4aF!jc%aoG!9
z3SH0gCystD$NP1TT21PqH@d&W`v5i*VlXL$LWT`Jnup-<>>OuPpUyxBM2l9xp}^!s
zZc(Uh2RRdCHsM60-#w7wxisOuQA~J`m`ylc7PDft681dk+{>|ZFF)~t9PjR|p+C>@
zwq*@{CdXTqJ@nOMysqq_hjP4kvWNEOc&|ieJ$;P#Z$nPpaE$lsAw%K%y&)%VImY{N
z=s&XH=7%{?!FU1k{Qr^DvQ9W3gwGy&ZC1;%E4>cf6aXR!ha=}^ot{6+^KQ$!__x_v
z?_o+uB)U7|y@LacBm%n7jsZDe&vin@f(uHK(O8a*VPyKH>f8jPMF{D*MusUrn;TL(
zJZaiJ{O&Fok&g8@nn3_eB4t(91jBE~$ESfLnC*#sTw*J7v7sWTo+O|hOck4LAoF)V
zotUrtYLE&yJ4A*gKhF>#xY{EkFE;kYSo`kSPY=STKVj$y5N9yK4xU)W!$fz)&#+8@
zswb8ud0;z0D#hvx(*6+fQ*vPqI>uOo>eeR9L+-a9jCBYx5QBx2UQ8B(Ot?`c19RGd
zQAn2qi%D5<V8Y&$Ks&avgRDo<MPC{yHBqrc%ZgpVyxTI{Uo?Dtbl322bZKGC?$9J2
zs=!eoXaC+IXo{7^G}6kVIuKDQ(9e)zR$k;ozxNQy=m6n<f7jg7qp=<&Hhf$ctr?&v
z!96QIr^ZYe<_q7->Znt~^5rhAQX<#o8SbX2r5A|ay6@dSz7MCi0Xtv|r#)L|p#sFf
zauRlF``p}{i(6Cy4`pVPFYH0x$Aj|*UqYoKdziI$)s;Zo#Z7sE=>ms=5O%=iZuWke
zRZS1g0$R4%Q`PjZ(o-;cG6Sj5@{HP%3%a3Z_L-u4q+ku#eKWOi-Xk$Z{wNAsbkBl5
z1)^3cE+}hTzu+7#j65j&9aaW3q9*7egOx-yBRzV=z_4zyv(OIpNj?$i7S3fYPYnpV
zynZSKcZhvi;G$xI4W+#ovpXrLV-{3y?JRnf@e;+LnwaJGyYfQNU#nN{_mZcIV#5q|
z%mO!RiMrVYdDwFrQF6c|irwJzpR-VVlTT=I4A-W?-V^Zdf;{)|{-_<tH5{A7aM|UP
z%V!TNWBvfu9qSyHEhM$S%`yOQ!boofngqBQRW{|w*aPjpvry&gXs=9DirTSP1{N_w
zK@1ttnUyJOs1=E0gF`J~IfBY$LbbV)AG;|rz(I4>!)~`6T%k2r<o+37o70Y1y|UM9
z9VdlSDcjg|1Kyh-pvC+^-3bl<U@7($v%oH$zdov!dj17drBLe!;RpT4UvOEPLz4--
z#vL9n^<G78qER)|nbOrWq+Cg+7Hu0ragvd%p6)CAQIBX5rwj9;HHbRPyg?hVT6G>Y
za*)*8BYL0qS<tbGW`xxmaP389k$l#9(C@GOK`S7WQ$W?AG8X05cWZ;9gQ;UXJk=GY
zs~EB2KoZ^J<KMRd*$uNVIhP#<cTmqLozv3vR3%}H?Suis+<5n5Ss<ULw~82F$~rNi
z8IYFaIU*)c(ByWi%A_XVkZa*pkcuysfm$V1SmZlJy?55`KfGlVQC_sR!J(g(1k|8l
z?a`w^u5-M@I&qMdFkF*gKOfyM-91!wOE8J+ff>0>57Yp^R`pq<!tM$c9(eBM7qIn7
zrSO~&rmZJo*TejzOvYv_UWb8d=0WvnVN+Zq@2*y<$7$xd?eO9ahO@K`^iqm>RQ{B_
zD#ws5QPXj#fErIO7L@~%R87V{wvUQ|jR~ya{>Qyb^-6-Og3PC7?K}IZjpgls2<pp&
z%?h38T#;usm((SWDd2d^?5O}!bX#~idKezGLY_Lr^B}-OJ&clrPya;aCO{hz5GXr*
zXtrbhF$;{q2Y<=4ggts(u?-k7+2^CxZs_KM;qvLzC+Cf+oLd>qE3a*6Z5x|cJicW7
zMT%$yU1Tl4%FAlY&HBu+qjNUE3;}ik#D7f2Yj5iDxnIkke%w*l00CiB5QlIA{!!`W
ze)@dGA-MQg$Ydw~CQ3gMV<mr<FMkK6i#(|GEB*96{iGkF^y4vB{AE5rzN3ucLj0rB
zm-y-Ut}q5y`g}@v*|&nyZ$u$Ve)hFO!6<+>QTiC3UhV7OLFsusJ-Rw_HIN|x9!j^B
z5AqzM^e-9w*$aFfdGzvGl+$FFY8y)_9rfuvusX7U(r57WTYUMeD197HUo6w}0NYII
zSMhYnV|L7!fk|1*E6PO;K01Q85mk>z!wvq&fbWbC!3*VUkQ2$v(s>ysSUqn%r>XVz
zjHC2Rc=|Fwy`0id=jn6(^g2qvnWtwrYyImey)lsfC6S&7*j7q^2g{A;;VnN8uu&(1
zt<15E_EY+2kV?tG3In&8o+}(Ae-EJ3vp>={7Erp2zoMV?CQ5g;=?#>Qzd`Z!uh;sw
zQTqBodb>{FP3d<A((lme`zigpK)Ng>k$>3a#BeeGQT~xdB>YoA=`Q{XO1J4B4I;Nu
zeSX=ji!rE+KbGZ)ZK0d!TEJdmva+S(&dB;9okQ0hwfgADjYD+4c9GsZ;MI0tFOr4b
zy_9}APtTsF`E!oK1kbazL-flSO2=z0pSANCrO)K)*_UbgOZt(&meOr)2mRJd^%l!7
z;1BfMAALXU?n6!<$SDxao;Mw?19HA9eT6h5j)p%F@HZejuh>@<`|YfRXk25*JW9U_
z1*x*jUZnleM(MKx>9SKpA8(}eBA(tLZN&8L{iJtO`gxqc%I7~w>3KYz)t8ajAwakc
z|7iQCYWqtl-Bw;`-#kjM2=LF;{B4x(s&5-9{qg|6UthOV`YnO<<yw9>rMv7uNa-&7
zM;?QMJ+BJ?=={&u@=GY)#Xql)^gIx`o$Be8a2=?J*;i;Cn@EPMj_siIu^>a+SR>fI
zJb*?~8Rx@wK*tS|MzrN)ArXSBp5#&bxk#k^K=Xsq0Ieb&-+=3Y4jiL9eLYJ^PBE9$
z=%=rx^b(%lBGXY%FX?&T>^HGp_W12`H_7<|m(%9!*-z=Na?AY}Wb`~+e$ihAl<wlM
zpmbN6HBmbLMoIf~j&I)vO1Jp|^>3r}Le5|3^Y5l~EORZNmG}NW<mUlP?l5G8ts(rQ
z${Wk_1*szs;9{bgZPLu}?U+Y2E<d(Wy33CnDP0tow!@ddozh)??(RqaK}w&-^RUvF
zKT<sTb<gcfC_T_WfuB%>=>I7Fvs}LH{>6CQM(Hm9Y@~E&{wdwHw&<qx3%P!_b~#At
zF8_@b2}b`9`CCExE1`5-`Q?Gmd6Yhn>%V+mWc842u|h;2t>{Arrf=#eeFvqx+QJ@6
zcg=+lQM!m!9pmf2;b^?)CGd|*r^ON4H-*w&ea(DIKbiAa$U26QE#Q=6i1**{d#P}U
zc&%X&kccN%6rr?;*A~JMh<M8!_;LsS6$ie;fv*hVSlzNbAIE&E5b?w)u5NW4@mA?;
zei89*<+u7e;@zgN`9;KA&2RN}#9O1U`9;M0I=|J|5pS)&<`)s~4t}ezBi=fF%`YO}
zdVZ^~Bi^0*nqNe`yZEiXj(B2~r0>;5#Jg8r2I7c^zvK^xh_^8e0uu2aaNwI9_=67o
z+YbCY4*a_ge2W8r$bmoXz`yUnA93JY9r&XT{4odqxC4K}fj{ZMpLXESIPf1k@Mj(P
za}N9k2j1ntUku@60LfzglIjO=y^Z}t192$4obW~i$J>ba5~0B;;_VCqbQkencHp}l
zxR?cn(jwl^Ll^=P?==VhO9%eC1AoJTzv;k#?ZEL=ig1W{-C+=ri1)Sw-{ZjFap1pq
z;5`of4-WiY2mYP|?{(nsJMeuD{4Wk1Pu&QIh=-?eghRwT5C#E>cz8-jI7GaIVGxjr
z_YVjDPY3>y1OM28`|tH}f{4cst6>QdaY_gS=rZErsUhJI@p2qEo+=U!5f4up35SS>
zr;da}#KTia!Xe_}sU+bL@$i(AaEN$#YDqXmJUqoD93mc`Y7!0+4^KG>hln@IfuHHX
z^Bg#yk`fLP4^K@Ahlq!#sDwkr!&6nlA>xg3;JAYx4iOJeVF`zbho`cHL&U>VTEZdX
z;i)a*5b^L7mvD%9c&bY{L_9p@B^)9ip866F5f4v+35SS>r^19o#GB&4#g1W!A>x%f
z=<!sUaEN$#%1kg^3%A)Y|5y9hU!df6h;iKf1>Fy!cpJn>f8kIGh&2eMMZ6gy`8Z;t
zoZUo|Lor4X@A446#A6}+Z-Vc5@t6ORE&3D=iR<w~!l@yArohGSXCUnx0vEeT0sM~w
zpBljMrYIZ#UWR`H;a-7@)q4Q{FM-Db7~V`4`_JMjR3Pn}0<R2Uc=J1fUlYQ|iUNv<
z@WldO5W;r|d`SpLAmUvna&0C?pd@Vp`g0o$a)hy=H0b9;7y=QGDwUB6{l9_8+lWV-
zcSb6V3kIU-k8$8)Hzbr6@n}=aNP%(0K;&)2qs<Q^6~-3>QS`$dc&-B%I}V{V7<WP#
z0uk>d2QGGZLTM3?w`M;X#wq`XT}8Zb`@&=xzZjK&8Szf@Z`c)#YwXUyjCkT{pbyX$
zjCUanfrxj813%M&pXI>w9QfG|Jl}zz<G@Ed@N*sbc@F%12R_Duk9FV|IB>Dc7)pb2
zIfNkq<FW%E@4!FjzzZCBp#z`bz{CAK62ds{pufn0Pjuk8x(o*x_ro9{5pR+MpX|Ut
z@4%-xa5Z`dlOu3uq%B0eOKmXNS?a*U;~)~kyuv|0-GR?=;86!29%qpd<_`|~@Hi{!
z%N+DqIPjScyxf7$a^SNa_>~U4!hwI$frsZ0NEq?L^9PB~ambnLz^``T;dzLZKhHsr
zv(0dbc-MtNKqB7t4t%}?zrlgu=)lACB_xb^3mo)S4t${luXf-y4m{z&7di074%{`b
zt98(S$${VG!0Q}%y#sG>;EfKv$$@{_fj2ww76+aT;U~b<GoL-bP%{jC?<QZIoGV$>
z46IY&U(s?D{YwI0p>ak3p}=p^xSFvY7WgWStC`tpFy+l>&m$CGB=Flby_#9h6!>b5
ztC4t-z}JNETLu1gjVt-z6!=<=tCq7<;CE<T%`pB`;OjJ=M^^^{x6NYDI2!X6rY{!w
zdM!u!?Rx^hQ{$?pj><tfVf-F}-=*oJbX5pb|9tkmL*cIr{2ooO%JKMPk^Wwds}buS
zf#0w3adb5nmJ0dod57weFBkX&nqKX6)B<kv&sPQg>zZE8jQ`z1|64)-rlwar1!ITV
z<re{-@4cbv)d+oygMPiB58J;@;16m!<LT-(2mZdmzpd%jjO66w(4L1h{thiM7YqD5
z8drYUBJl5O{ARlP3^as%FO1(U@GY8N^*F~2NBS_nO5nfI^f%E}x4_@lcoAKl1%;UJ
z?Fr$F1pctbRefG7@b7C}t#Gyq{1J_-9n#kY{*K1aqpPz(V7~Wzjpxx-oxrzhT*+T2
z@JB=VR)IgJan&!qCh*5Ku3E?k0)Il|>*;DFFoLO}a0cM{-jf=?ldfh7{ArC}M^{Ik
zg!Ezj1%W@K>DA1;4LYuTFN_zSjQEc=y^?>uz@OE)vS+=(pVK(LM~PuO;I@3>lGuiq
zIPew+{sX}Cy)K~(|KNL>7=G!X|Gy4=?5TEpW;pQK4tyTq`Rw^up0LLA?h^P;y7TnR
z`xIE}=d<Tu<A`Fbz;|hS)vx|Z;IC?YCS9$Bt$}>@{7cEngNC2ao`1ou2rt0j6WMzU
zYX9;YAh4B7l>^`8z#nzsKXKr%Iq<gu&-eC|)%4yU+>M8ZneV->aUc_6xxf#E@Ye+X
zhQ?LDeHu76pFQ7!?oNbT1pW_AKbEfE6!_mYekEO<cLvJ&r^fT?YB}IGe{K}?A8C5k
zFYOcf#~MGEuBy*O`ThptHwb@M;HtNScfbgborUzj)^b$79vAo!YMt=i1EQFdhxGn)
zB7C0@!%y=N@5W0JW>I|F*@*YhoxhX2QQ&{jxGI+kV0^y!uEs0q>Un{`r*Tzp=beM}
zy&Auqt{!&a2L=AVroV-*T1KOsF#Z>T@6+^`(bd<$;rZTQG_LF)3k75=$GZT}XU`cG
z{RaZ~pQ9<f;Cws%T>|%?$9;h;_(0&{`WzjDa>Dg+qrm;=bgKURr@;N^ajHJVpkn8<
z=Xc7^|08h!IURH_B77)t|2dtiKXWfYIsS9HLZW{YaGT%$<iP*xz$2eU`Tp}hs5T-j
z61e}oPuX)0RHS_NJXGPk1nxfve3j&kz7XmC=YX>b|E9qG=YYzeS>usD++G?49xlh%
z1)itdo$~W>pF=tRbHigv??Qox>(6F^`_B`PCwi{{<@nDNmHdAbc(@$@D)95PJ<~|e
zRZwvG>^UQ3Uxc3mZY#%Kf?mza{c&yb1e6mlmk$N*KbM5!6XCcbr1zgoDnI{F;Qn(-
z<)5RWqs#Zg{)`Gd?9T@T?mx#IL3%#~+?KE0i|qI*fakO4p2|Pp5_s4Txf7Aze;%sJ
zt3}}9eC-mr|D06yOJgoZIsS7}Rlbi1+<#6wkMeaiRQP=MoK(?&S>XP2Qib;jT&;dQ
zPqp7ElTc2nt|upu{PhC&pOdP5{Yl`{G<|~T>n7X%nRMXm5YHxN7X7P({s{;EJm7hZ
zUtcELs^CcX#Q5r_CNF+nWwN|*Qd4cBI&pJtOCk<O$+sk%TdR{YsU#k+ZLDpmrQ5=y
zs=B(y>MA%ZK3>(lSkg?aUEI*voPe9+!Z@5&4`<d_$C0PDs;+i9+%-1Ds~T$JH3=yh
z?+a>cfqwR+czkg~YrMLxt$YIT)+82Hwbmu$pyH-@Q)69i^|GiJzr6g)X;aHVa_ho)
zS-idq4$O~VcU^fQ*tR&iq#4e(k5^TTbMl*(Mkf@6(PT5~DsE`3uPRztpS%c8#!oa)
zzbLvi02dYr(_6s$7D&8mW@Sr#y^~@Fu~#N1%Cqy!X2WImlIF&S#*&)WW|8f(n#$xw
z;<Bu~t}qJ!3si_gsH`fhERIK`%9KpVVs@os3|KM!qO!Wm!nm?98eJNN%+IbYh?mWY
z&nW==&<T~v!U^%K#a9C0O8_m2L1fn#R15XcH`vDqslfMV#zY<gD}CG04cg|a0eCjb
z%qyygFFwR;lkkxVXr*Np=~qjvkChk2MX3+S8kHBnbr%*jByNuDzGE<1%&1Uhsio=G
z_0@Ez7naqDW->>(X=b|o7m1H_$POqKUTmfVPzh{IQ^U>m71-rfCaW5tITyvt8e8I*
z)h8F#RV{9*#Eeuzjd7(^H-=Iq$5mH_5=dCpY-bYFb7fPap}w&uA()`Hmehh-*aH-|
zB$A7wVc`YRROqj(YZDF0+C{b1phvS|y*2y#35KhS;#W^7oY0blv13`h8i$*rhQ^zX
z0fEMeLJ9k2Tv&hu2#im)4N0iU@vyKcp#C7xXHAMm3nvP3Ew%9Rmw2M7v3iMNY<(?s
z(C%@hu5tqO(bz#3RyWqy!{C}+)|4ni%X}NCC!Y=DP1&5n_#9-3MvIXWN4ksS@Mo%Q
z@@f<tnA$5cpmxwct(<7}2n=SBcR$;pCs3Ai;*}+kjDz3wcv<C)c;zIx#rmY%C^+Q^
zS$9PStn<se&xr?gB{i(~(wW1#lbX83tE*a)aU7X~6{%0bQK*XkJffz2A<TK;e{^D8
zh=Xce1P+Bdh%SaHM__(Ilc&O>fC=enm*SzYCM6oG7S_QixTtJiOEVU85zJ#-bml-Y
z^o&cZ>RJ;~sQbDdNTaBhV7bpUI+Bawi%d-|i5jD)2uvL(=&56Taa&uwDbd{0*Z?!(
z<g)nE0;)`dpE6RfRM|4UQ0U8);EU-CP*E}Ut!$a06lGwZD`m}XxVaWq16L(lTH$+G
zyt59tQD&>J88~(7x`;TtGTWM1R8><WT6sZS_VJ>P_f4vL#T1*EkmwH?DRQyFI?(0x
z)#z%DT=zc!wS(^dp}!e`+F&m+fWdBHY6p?qLH7WYnp#_yh>x;i`!AMrq9XX@8~7sK
z$_f%-?7JBML4&MJ7QqMU1}%nlp1fu9I_`I`oZ@2oJRQxutnG55{Cb{0H5;@)i(6`!
zCo(9nwoP`S_B>)i^_4#?fxgm<@neF05tOL!3HC)aNq<-ns9L_<jJrIJVk;~Oe~nOz
z*0rs^pQekGR5m5WVfkX17|1zmGklv+%v+)H7RF)JhWTtU4B4<B1s^a>*1~XJ2GcsI
z+r~5(R;psUC+D(iRHq?bY+vAH0~R|-7!}hypr_$dZjpQ{hF_5(!~5zjQ}j4Ycrxd4
zoM1zTG*Qg)nA<}h|HKXN$K!n&Zg0ee6ZnTW`)X(t3U%uVvsgR0!Xo@=XQCPQQ=8#4
zp3#NUe}yHmGKTF{iNfDRRef2tXcSATn&Zjls@h~rSv77gR^y(5zkF89Y>-o#tZOL?
z^F+mJUCM7%qG6^C6}4h!HR^{6O?n0tff&_Gq{pcyaCXm6l=&x@SGHUPA1O^P6SpFB
zPLEi!C}#_D3A_+DWFfz}OcV}ptZRu@Tn1cvdja-E{iGUAB&W-%4ZLzUr}k4>g4G<^
zC72Wsy>!NwV0w7zdk!sqn$cw~?vStrt=zefMrm1F;4f=&3a*yDYJVS+GrrPAkxpId
z`W$vT)b>w!*_*o4*>tWlSq$5n@<~x~qM@eF)-l;S8ESzvDyt^fSV;{HyE5p>#M2tF
z-i6)>Mi_)e^TP|(U0|=4Tmth3`Cw?0{^+rFRe=s&iqnH|yQMXRP>_M8?{vKb>2LcF
z_9=%JIrt|~GqkLXnO8Uoz8$?dk&Mr2@;Cmehc1cFNI(zXxNIPgC!mLpE`{X^%rKx1
z;0#X|QpPM-&cNmSpqCXTFp-bzrA1{D_t>CrHr^bsOEkdaOZh2pe|*h=#_Fo3s%qG7
zB4-y(h{HqEn-X#PU}#aip)t``ooIsHy+J%Af`(>1X@X%h8pTaaHMt(N@4#2HRNYC4
zN66KUP0MiRVyLdbIsx77q<B44jA>**#wrBt9?pVi)AaZ%5k+-4-d83shR><rT-97t
zX75D$p5jKYoD;ta`kumx%?VL9&51>c<^((jDDyWZscuiy&(72w59+?H-|9zYbdr9+
zk5e76Te@z>oa#F8SEc{_e-V5uzNsMI(o|iibBT>@Q4>y-p&N%;;hgx4rYrsD27@ke
z(Px<7n=jKwE(LA6cFCn%UDDdI>#<sFjxEH}VJlDS+i6{>9*0m@FXg7P6&`vhgTqO1
zDqx>2U_$}7qyB+xf?M?(^8z#_lQRN6LZ;#vTODM+fx^L8A%t(>D)@lFpGgUhfqThi
z1U@cPleHt}rT*ut6H<HWBBeJ<ARG8}IF*+;NFF@Kvim0}B=yIN7g--Gmf&MW)g2F7
zwPUw6b=9fV-B9BSPUvObq3*>|i7ibrldU~qkVRK!n|)w9U5M{@)g@r74#qv~UqaP0
z<UZNNpqu0EE|7%`&))n|U%XwUCbk0_E!9jA#^@vT@{?ND94Q5Q-IO^6M?fU4-e8v{
zK8S?@tqQO{;YL-)O3b;Cfd){Rv5A?#P=G#@-o6S?bZu?U){|2Yg{==a0kyo0Ii-+I
z&BcQbSlYlsqqRZ5_?f7I$z;>AIK1STk$k{{50@BXg#j~tcvVBJ|HMumyg^zOUzk{2
z+W;>bB&(Nr@LU0q=7d;k(H(3_s8s&uX$Ia+D=Y}C4lWX_gN)XW;M_bgbq77>X4}L#
zz8tG(`tc=+I(Q8M-g+pT(S&n&{qn<7wFiW=d3fmsr}Hz^@+o6gJGkb!B6DkwjI7YX
zwX@>YOA_$PF)YcW^>Qr^4?&B~>Ks}?R}E<Y0-i(Bi@&lim*8{Pt6&+D>9yX);*D$A
zWr67jZggQG>-EzSTCC~P<%_k<^~JjX67S3oYH(Luav6Q0RL>X&Hs9dW0U8P8RjqAh
zVvz)fssbIzTfJg<EXjB!gE?U(>@Uept}-0DtSe9GlHe^cpu@c|zeCbJQN8Yo)OsPb
z$up3#Z-N^8GFr+ddMgNe3;9ZIVM%!C%iO@nM?L5zYU;#`yzJ)_85qQPFAZxdF$;l_
zVkvYPxbGs~x;?^Un(jQ(8Pl#vbxfo8TI%K7F0k1(A@hx)02+g8q@vfg=LDJpQCXTo
za8Q<S<z}?{s-ACPTZn4@pHY+4^P8C&mFmx_sgc6_CUWzAA^vCxf69aj@M63idU1om
z(u_s#i?5K%G%>BsRNKZEj^#99TAX%LGwOn1tHuU~9kkJ^Gpf;_b)Y8fKG3-tRY6ry
zJaDu>5E#H%0qYB+n-z=5cs)KQk-vG$%tA)Ks~?}`%Y~OL3X9a!81WhuKFa{NfkK#t
zie;3Td{#{mQ-J{t|5)s}(+wRIyyvH0fGUI^d1W|vqOpJq;W2!>T4n>*hVY9(<;VeK
zfV?)_7h?08QU{Hdc`cJp!>tAL@zVfmTQNTUrDxj7Krsx!H{}cAbpzk$dbJPRb=nKU
z2Us|;P@mWgzsGE>ho4PWCt6zcbHI4AYH?I79AVjBKcL4;(Aq0z!qOaeeX$1DC*TJu
zOB1qp)vwSLmqAsKZ8%=j(imR?*3?;v<gg&t@>R)18~zM6-XtIG$8So&Pj}$`rzHK*
z7Jhy!0{(P%7W@kLko>yr;TYlg`{@Px*SSDLIMUBy^j9(ZDn@@DqhHAAYZ?AohPN@i
zn&BNx{xNXfMECbF`Wi<61jF%cVwJBhhI75IGd#iQdl`<u_E&PU;2Hz^0Z+A7xcY@W
z;@tj8M34SN`&Yq#4C)urNYCw$Gx^;928MI{S2CR2e?P;y{Xb^-Jh)c&?_&5OhW{tS
zd4APzuF?Lh82woYKrrjiXu>h#^=B-j=k=$M(ewIK%5d(7s~OJiS;*w``n;IYb9?F-
zJ+~*x=(#<s7|!jvli~O)dgbTG7=9hYUt>6)0-)&kG8}&`ukgPTZu<Ekqv!ehh|yy`
zyaNjf!qA9}G5Ja+e-y*7X875JtNMJOWcS80dM;-?qv!sd$mqF0qYUSMt6}&J%%0T@
zzmehJVL12O_X#)s_86n*etVM9bH6>y=y`sB#c=Mo-x_lMmHhU5M$hHE%jmh^_Az?y
zH+9rEmLr~Cq3X%-AyScvk6}2kw?z!cQ#O>GD8o7ZwG8L|PYuJloTUusa_(R_m-8)#
zbNa^}_)84ue)z2e{{zFhoyQNAe#3e~Bmy{zaP;R-n14nx`ey-G;S7dzIr$9V!RXIt
z@}Fb!cauOQeV)<7F;F6)Jtr}Gj^imjsvPqf{RoD`@g^dmoKqM+o#9;0T!x>_=;4?t
z5l{}E>Z0_*F?1UL48un={CI|6;=r$G_;HMWB*V{OI2>mr0@}mzOBs%*_9#2mc`r!M
z?NR5PAfE4_$J2I{99MqNWqR`%Jsb-o0?NnJd6b-ThI74j4t$*h-|E13FnkQt`zFK3
zGW>mp<0&@E&ch7n?G)=62Fw?p=A-D(VmNPiyd1e4UXEN2r$3j;FA-X$-U$pphv9sj
z<@JH%HH@D7KV3QV9dfQ<IG)a<{L{klVuthb<#u*6dT!@m89tH8Q73<4xo|nyMlhJT
z`jQIbT>e}}&*eYFaGV3HeDQfE%2YTnM?76f;k;b9-ezH?wDVF%zntMbUz-@t>9;VP
z)AMx%j{9#B-D`}V+q0kHTt08_T>dIX&*gAGaC$h-O9b?Pl;!s;4CiuqKQ@EWt8=K#
z^5y+4pLbou<a2vgFr4dkwF`AFpILv-Vsh{+HLBcQ_D^H<We)n83_sHOh2#4~z<iy`
z^z!xfY=-moGS(|d5`nLm`8+UA={OHT`lIN6F~z8t)89<EqR*lGTO9Pa8}v#(?`OFD
zy9|1DCYqYhn)ZClpjYxAa@g|&gZ@;yf6_s}-Jn<UyBzdCGw9*CPZ4;#=K1}VL9gV$
zO}Lrg-y8I*e*c$)exE_F<R5U*|HGhH<6;(7B-5Uw2v_w%$;Y`329y3IgMNY}^Kj0F
z!KBAI3<gs^&LuFI^f=aIQ1nrG>s{oaSL2SNSN2bH(5o+9DSFl3uuL(S_TX9%gQ8dR
z=R4@tT1?R|k?dZrgZ?&#SHQLE-|lAk7a7jSO^!do==pq!_iw!aDJ8owzr6pcWOyah
z%f}J)iL!GQ(X0F_|Ey(lczgK<!#~gDJj(ELhO734cJlFskEgsp$F&9qlOA~$ei@UG
zdt3_V@|QB4kB{GEIH%|PX49GcZU_C}89tTKpNxh>Fzp}5@KQ!Uli}PRwXcQAUtsjN
zI_NhtoR9C%Fr3SOop3Do(<m=!_iq_J&izz9k>6dD`MR3n`<NU)ejQ+Pc>Ng?K`mi9
z#}IDT!xI=im!tNnPz|@|Jck^$Plfbc&LvDfmowcVXC|ZPauzb2%USG@)5z$#oUbvQ
z%js~)xrfnnIgc=$%X!=(=NU%N<@}Q2T+W*gIsd`vxttFe&gFdQkaL*Pb2-Nk!P+D1
z&o!(bp3iXJ4`0l1-rvr2;8!!8_rrX?%G*(~G)J^|K2PTK*D^af&gG0^a<Cokq5Bes
zU+<8EW0`8FT>cUV{w0R<eUF<MuFjpt_EN|2)0jQ=3?EClYQGQCJsfu|0?KJ%^phFh
z$Z)>z#Mg<njJ}D{^Kp&iY95Pvzs%^r!{l&3JkN0M2fn^;W^$%5`Xs{>3~y!lISl9J
zeyIaj^GdXTDU;*EdAZ-n=*@cdF6ra;4`=jT&dm-ym*HnK`?20C`MmzP>g_Tnhp&s5
zGo05?o-giCj(?unvx3Rt>r+l&$LL*la(dIBdx)R=^EjrL`|}nD-o|is4mOtiN`}v1
zcCKRh^$fq2;au-(2fl{kyu5flFJtm=XY}0vYZ=b<wlkdj?GA>2hRN?>xLWt1AJ#FP
z_lLawUdiNiGWr`BzMkPphV%R`WB8qn{#J%>U^rh#^YMcBdv`H<?uQnJt92OW7uOD|
zKR<=^s(w|?v$_B8W%RsX<$hD^B$UtfR<V3t!sM)GIPc&1{Ga>(X-0n^lk@Kk=k$EN
z%=_C>z>5Lx<mG~G69eMB9C1v+fH=1w^Ns=W`{7#o^8to)d$uv0+w%&;xjnyNIIkyf
zF`U<v-!q)o6W(6_6=;;597mfK&insO4CnL@GMx7(-*(`e8P4nHcN{pcKfK<4m(iag
zyf6J<&+u^!=jBz-a6V6%$8b*1_s2N?RYt#s>AjcXoc<w(b33;?@Ut1-#N-z^@aYWi
zWc0iozsGQ1j<{E#%AMoiXE-k}j&nah!sw?kdw#(1^BKOC;k+Dqx$|-P1V+#E^%%ps
zd~9nfUtInZ4CnT9oR9B6Wc1uWPcod_^Ay8*|IG0(Fh6j7Cc~d*a=6|fIq+u~&fDX5
zhV%OWV}_Gj4<ON(sqe+-F?)t8B8W>FuJ&>eTEOt5=^CR|bdCLF4qaoknXVE47rMr%
zi>?tLM%Nhi&^6-6(Y1)sa0rO2Z$sdH9>a$tK%tc3YVR477BKt-MGWyOhM&mr%?v+@
z;av<@WrxW<3_qFC3&Y_a?N?(lCiC~5)E+kCGmsDhMruxr_caVxYbeBTXZYy|P}st7
zHAljvHwnl2!1-h|;=PPs^$CdoH=|c=4e{Iv{*`u~DKEv}0*0%$g9#N3&l8mLPZPt>
zX7~n%=QDg8!_Q&(TMQq~@B<7#m*FFZ$Q#Vwc?>URxaxB-c@D$X+!FC5!_^uV@p~D5
z0Rj}BVz_F{nDiFIRo{j90fwu+T*OCEqec5wUxauu!__lE#OE+vjWviT8Ls+d#8(oI
z^+b))h;L%_MF>#%9-}W-AjH35^lI*n$@>^SiO~<EP6zX)=AxK9j^U~gN4%WjYHUKh
zj^V0LMtmK^FF}C9R)(W)6`m#>{cx!QA%2U|mooeS!>2O*BSTIFrH!P{6m?Ex^ivo<
zo#FEtK7-*a7#?N#CWc?e@Er`loZ)*I9%J|+hO0FlCgb;CF`)fdh%5PL3d7adhbi+J
zuEs3HS1^1Q0u(kee6|81-obFScEjX73|Brz{1C&xhyaCAxaj}^?N@6Yyr05w^-Kiu
z`3#?f0EHC{S9?&Hw1#k1e-zJVMt`*u0&y3^uVHu(!_}M?lmBeUS7}2~Aq33Vwc<+t
z8O`wP7(RpH*E4(;;TU24pHFFZj9y`ww2t975G_Vq8Ga+fpC%k5l;1{aZ!!8fqd&m#
z1q>fSlMKvn6~l`ezL4Q_7+%frB*SYMelNol41bE@ix~bE!xuCB0K=Csd<0E~(EeJ6
zmoWTG44=>Nn;5==;dKn(#PE8C?_hWX!}l<}k>Q6J-o)@xG-*Tozs&F{3~y%me1^9$
zd<Dak4By1?R)+6j_)>=NVff7qKg4iV*D!e$O)k-XwTFZF6oxNHfWmx+e?@^1uVVNL
zhHqy0N``kaT&+DZxrgDmFnW(ByJ+VshUYQ-R)&`{9C_6DSr-_%S{HcZTb9))s}@3>
zY?kp7b%m$yj)xzPB)#!9Rmm!E{KA$NPXOcLH)*MyU&D{8;h*5qLhuX%^GjQyQ*gKx
zePX5-k%|Oykca=#A<dhTdOYw!eQ{{K@JVx{^auI|Tp>RVq=dRY5ag$`&-bto2>I!@
zj|EA^q!7L|WdClELFM>_kgthZqE0x54+#aNKsozAJ#v3jD8K_h@^XD%C_rcZ(vY$a
zkD;Nu5%|sw97cB}z8MrWOsgK)cYNU2U*Qw;`ulv4+k!!78GoHAe*zfz$PxZ@7SAK}
zkFrhalRmnHw3qrmu}vSR!#|zl$Zaa8jP-qFPvC^Q)W4$Ru;+iy2X%r5`X_tKkFc<B
z>jVj{=cB`K*x|R#_=U#ti>s^SbfPJA|L|c;=-T_^(F22%Se)dMf+3cV$>`eUxBnEY
zm(fd8XXy`Aj&u7|^~BTQY^@}NZFyiHO36TPVl7eaeAUmU=;x?E3Gk~S{|1R)5ULF9
zbnIIt!Mb86_di?W*AaWh<o7}Pr4nBW>4!>vD<l;I`Da5?p$~jPBozk7cSM4$x>x>G
zzRV%M6oSJ|nx7QuBM*J|_krdKH`dRhd@>*D2<E|Fi9?^gNQKG$1&koKwKar4hLL)w
zZ{ZDe{`NfjADFOe>X%9N8oDO;-7(48z5XIZ&?Q>N5uaJOJ`UlqXAm=s41Xs=_kh}0
zsrXxeEy9t%f%+Rt(>Mnj259?H(EeG(e2Hi|9ok`H+C_l28HdKI*@N}mu0B7V==_F+
zE{{}aKwy<W<eC$@&~k3@hvbVRkU~Bk^qF{;Nh8$AL5pP{WmKO1M9!Yiyn{kttqZBh
z{JHJ{6?h2Ppudq9k~rwk<*8y7TLy!!FX~%%nUyNOXcx%3zc5Lqs|@XmVPC}7y3S4#
z&yj+|)Qt1;9hUUjBU()`x<j-IGG?BsI>3%FFxSwdQ!4JV%^8`;&|8iJ$oWy*f+Hio
zbJ=5RJJP=Ee`N3fY<)-FaV)Q+(Iay#zQc-CDo3ay2d57iJEq=YVtS1%VB06z+5(5y
z5B^(FrZbMvv9{JpM8<SkXFeH|E^YT$KN%D3`kYjtOa^&?JRaC-d|4_WJ~M+eKa?`F
z19S=d@>9TYnDl(|AAt&3fHQ?bPivrK9N_n$Oj|NnEv=u13aL9nN2%*CBMtKNO{Tqr
zFRIkO<Yem2T(N}D2{&{N?0jxKa6BMrkUft2WJN$Gw3Q?GSqa|bWX|1=Ph@CinK?5P
zhAue9G*|%WEM3@x{GuWpW$yPc@KK9EseGE>w+I-L*`stVpRNdM%VcGQl44(&JJMgV
z2<8KKq6R$zK?XwKiU_FrR6ZLKuppzohkurH;0qFl!U6TAavrK?iWxl~+A;|}QXf+=
zJdweU`$&9OA!uzzeQtabA)pDmkWcRG2m#HR?JI+4soTDgpawkKrv`$CWwcv8B3~U)
zdC9B~gU$ZP9rtf~8#h%x<)i(Nj6?k{S|7I`$or8!B;Qa4<Kw4zUcPC}kvcj*pa;j>
zPvR8)fKt%;$sMR~YCb||>+9wLGqp^fu&?DB1(lI=_5;4bs_>~Fxi2lkukgjInwzVZ
zdE@bg$wULZ0^QJ<OpL#L_S_3wl2z3=(efs-B)+J*sy+el(KR==@)zXUJK5=6#xC?c
z{(D8mTS^!7vlee=7W{4!{?pIJbnN1fS@@4ngJ1o&hU>?3KvLDOey76ubJO5oV6h*6
zN1UquB^Le@)8JP{%k4iY4Sx081I|Ap4gOUY`%g}Tf31c8lr;D^Solv(gMXuie`Ff`
zn=SmOrNRHOg})uJRO`<+3qPKXn=1cy3;&sE@ON4G^U~noW#Py3r&6_lw}l_iw@#J6
z+rmFO4gMYrKc4TLs{UT$r=K&4{__sNQsqBj(ch5<{~-&1VH)-i0a^?>G7|O|rNKYU
z!e5*QKlW7^xc-T0@T+rLIRAt+__3~-`ahor|2PZ(lr;F!SEhbEXED|KgSt%q(lq#I
zSorZA$yD{1TllA?!9T~skLO;ds(-$PAM=zde~pD7&-F}|zs|ysXTzq-pS19orNO_#
z!jE&fRQ2C(;m0yfm46-a^ZsvE8vGkA{8y#HzlHdD`=6Hv|5gkCwQ2A_W#PXr4gMV#
zemr9;RsZj_@MD`umH!tO{u|QZf6KyuV;cN>Ed1-BLrc~EcP;##Y4Go}@L!OI{Rb@k
zcs^08_8+qFm!!cz1b>VRf!F_~Y48uT@J~vEe}sj9avJ=jEc|#bR;vCRZQ;l7BBaW%
z&O_$@!}BIn<uA7AZ%c!JiiLk!8vHXX{L9neFSqcohx(Li{^wZuSEiwVzJ(vpr%Y9U
zjfH<z8vJz@{zYl<CoTMUrs2O87Jgh4r<(uUE&O+-!N1PJkLMAls{dXKKb~8dD*q-6
zKc0V=D*qM>e`6Z_TP^%}o?fc@pR(|OISu|D7XIcm_;*_PaZZ`4{lBpA<9WfU^1o%_
zzdMckx5vVNPa6F1TKMrC*i`M`XW_pi4gLcbemrkBRsDx7{P(7je|3&MAHQ%;o~r&~
zbgmia$MYdm<sV_;zbg%X_5DY#|L!#SM_csYn+E?l3qPJ4n`-`xE&TVV!9T^qe_tBq
zKf}U*e;WMd7XD3X*gwa@|6m&Y^DX>%K5VM~ud(oNPJ_SB!jI?2rm8<_;m0*;s{AV~
z{CEy)s{FTG`0;GyRQcCg`0@PKRQd0<@Z*{~RsKyDemt)=RsJm&emvJTRsO9O{wLDl
zf6BuD!!-DJSorbW*i`M`Y2kk=4gOzP`0-rXRQ11Q;r~$@{Ch0?cusAq`roziZ@2L0
zL7M<rf7c8yMc7CD)94&Yg(EEwc=jpURLViP-v3<*Y@>4xaqkV!6;tEcNTNRvVw6eh
z2!%5@>D)aH(1odg9N;L^#c$G^_l3ZV*ikg!fJ+gS{!t2`*dX)pOf(z+6=c8pY}w@h
zv4dZogV>+`zd88ZQq*4z#-shFT^ophKcw-X%1>dR17=(KZ+=8lsB*^g$8*^fzrv@G
ze$)S}=p4cml_HAKRvxZ!@ZU}RBRDZ!oBX(@L(J6wP>TFGC${nbg!t*0Ic){f^6;pG
ze;4t?>j{Bi@;?lK&HvjUk!;G(=>OFKnf`y%q5p3w>i?xf|89%^H5UB`9QyO=9LN63
z?=+~mAR<(L6N#Ung^3<9>-h)(Z2s#d`xhV+1oYoph)w^^a_Cp*S&k9h{vWiJhnPcu
z><5yHm){)#nfhBD`gc(N^SCm&2J%2?a_C=R(U1GjrvAGf`uC9j{`$Y;q2sod-~S{2
zF9ocXv>d|klG)0yjqHyRF_s_hKbiL9IoLM+*U*C-e%3FFGRqIY`(@MLWzmoOPp1B2
zV7BR3=ZW^$|9{oNzn$!#fkv}n+P@6=;h$H^Xz(&;s{{%NNB`ma+q56Q({8h0ontzV
z_*G=;KL+@1`b$VZum8AyGxb02&_9OGA?0TdqYY;Lf5f4`g7nA8aLoTki~eqh{^g|q
z0wcqw{x==^H<Ere{-XW|Ec$WchAOfR!CPqHIi2iRkztNEiTKU&?|I@MuNWyd)Of|f
zZ!5pGk4gq!e;x$LEWav;{(YpMpCOJCP5tv7`nOy3f7_z}ZioI0==@gGf2RJo9sDzh
zpP!A3_M86ywZr~ivY*$#?^^7i5A$o3X_nu!q`wiBvS8}J2Ka5|SMZo*;`+A$WR~AM
z4*eJ8O8$2Yg{J?0=g{9q`g4dM>;FR*{S(0;Br)@!LlejT#-AG<{HGH?u5WlS?eBE(
zPbB{Ae)7K-_-+1Q^|;j9P0BI<I3)pc5mdkUBZvNylO$b#_Tzc(Hvi2@kv|*O2{!&F
z;wLwd)#3Jw>nt1pZNz_%=9Cwv|HlHq&42T@NqXM@pi52v-RRJN3QauM_oM$>hyJys
zKSummeot8R-|5i5h4kOqkN!Ix`gfE5Hm3hci~fIi=r1~1GG1rs$Fkz#XAb=_%q#@%
z|EJ;J^#A%}?d9J|`Y#qz{6D7tcHp;_e;esn?F9Y*j79%dFwaDpX8*C%k$<!P%>sU#
z{tcG=<67U$|JNP*&pbuuxxezi&7r@G^z-t6)}sGs4*idk{+A7h8@FDULx0H+rTweW
zIS|nQ&p~YF|8Jz<oWHbC!BICLWZ)lD|HNVT^7|U`w~&4%z%0K);J20E_8&=(wJ<-!
zfcAGmY}y|u{igl@pox2b_TTHUf7ltQ)${ViU;mG3f475w9PwZ7vx+Oze{VYSU%Epw
z^7gwEAT$3<V4jOI&Gyqm`o~Ez`md?K8u)GgZzBD?{l09`KLqB9HvOAPe}C(zn1g=@
z@z2zj%L~)~>A-KZe=XS`BYvzuuUhQi>(GDnnUb+T|GnqXzuV%!pIh|bJKXNSV$$E=
z{PzvwH|u2u@o$#d(0|caJoGyB?@v+xAA#TI|6WV}UkAlz{k=5Tp8sRdl2%UV#=^Ck
z&&j}V(=R$Zpy%cP2Hcza?|0~*P5SYjFCI+&cRBROo|9aB{`@9DrvAS;^smd4{7}6D
z!7#^LaDv@`59CR<n*!Wn|6LFKHvd&TFBuDnAKTAw;M%nRC5Qf@XG_Le#IGXL{^uR~
zS6TFTTlAj?6I_&O_J3EA{vxHAVpIRwz;Da{Hqy`gzqc*=+Z_7Wq^Q5ep?{Y}{~n9}
zpE>kzC;jIc`8VtDXJCLtpP2r8hxqgQ;cs#9e@y(B8T_b=hX&xc`M-zq&)aVgFq-x6
zBZvMO`BID7f13IaI`ofuLGttR|AR&UcVV4?GEM*8Nct<J82#7O{~++&@?T2&dH&zE
z=${1hWSjoSNPmChS26J0^slw#|2>QTcO3fPCjI98CtG_$Ui{9Xe>3Ul{ZFq&|0I|v
z+wy<*Ia2;DhCb8(#lUaN|8CO1fcUZhdmpaN`uBoE|7OzPU;Y1yLw`Y+RLskNpGE(|
zQ|<YGkM#Ff|8D?(TmIWfKd=9PvFOhQ0-OK6(bE1Z8R@^K|NjN}ZTdHKNtOk~iT>XY
z*QWm)9r_oNel!1O{ri$b|Nbti7?MRm{U5-UssBG5`d6f=|2Gc(IX{(px&8wd{gY0!
z`~QB@f29<o|C;$P27a6W^GH8$e}A{=zuTezJ<@N^UrhZGC?IfcsQ)}b{A0D{@&ajj
z_}F28g~fmW07kR?7CQ8wd9Kuw&y~ToY5xttZ}Z<0i~s&<(Z9o?zm)X%JAT6kqs@Pd
zh@T>z3)6q60Kd)tjTZlXY_b1AhyHb>_#0ZYyfFQ@(V>4E>F*+bY=4+hv;4-6vgdy^
zIj)%aRb=WP4g9wJH@zr1c>BY>WK;h}hyHn_zeFjf7_-5{Jr4bANk4CYLoE7Zuuevq
z=KNy=>3`o)Xy$)9@Z0jgo%Ht*Kl=YD_;2RF%b|by7@5m2@XW%sssCAr{*5~&Blkb9
zrA+<f&a~%$JL!k!6$qyO^MT)%|7{li$6EBaI`sdF^!L~PnjHFfS@aLH=zrCr|3gRq
zK|~<zbm;G~=s(_~fBac?|DQ-5cYpo=Hy!+W#1GwjAeiO%cL)Du;y2r$$^YNLZ!7<U
zmi(V+$$vc0p8sajUl_1n(sJlJ;J4||{h6fV^?!s#e=qQ3nVRi?3+d07V)S1#|9=F2
zoBlBt{ij&;UwyV+|H}^lqpduA5%_KTOG!WP|3+H$p9vX6nP&b!CjGB+WpHig|1{vY
z>2D(a-NcXm-{}_pYaIGtxj-_`HuR&dJgjo)kG(9ZVnmGkM_KeA4fA}IY3Bdr&q~H~
ziC;yg{%qj4<$nX|=k@PQi~cr;{>h}jzy7zyq5olv{ydBR9S;3-NIyKD3IwzKPaSRd
z-;Km?_Fty|ZgcS0Ir4AvuLORZ|GO;tA8pBh0TeLGG|T^Mq<>l<_u=~gS>U(n@3z#x
z^DO!waOmGf`Y$x}oB6-bp+D~x$;j*97>oW9=h^fBchcYA{=;zKx8=Wt^z;4)-zCbS
z{3yHDIrM*lHeSdK?FBRck2?655I?vl5KR7u9riD=`0qlC{qxSZ`)@tz@2~wl>EPc;
z{II<h2xj~H49rt7PiFnyO8ouRpHqO}R(`82`N#8T%<}ubL;u^P-)#S;{inh>W3#`P
z_%93OUea<1=SepEH<JA^BF6q7-xW9Qzs;e4JUPznf6-PRRyy?eSo}B9qJKDyb12g+
zKRlNW!#o-3zo!0Uf!~(@gQR}}abo`Q`~frnH#_uiPBH(@4*m0XNyVz2qkcR;z|_Cb
zq5mf->i;i?{w9lld{^JpzX&=QfJ6R&i}d$5|EvOjoB!8Z^y9NIQ$P0eHvLD9m;Qst
zJAq)fpFC(MHvUtI-)w(K%foFB{&R^x!HMD8<X;K=w)}6l<R8xuGV?zI{ERk63_Tl2
zzoJ*k=KXNsx9Q(*(LckYKM(jZPimY}dNz`NbN!_9ui~-9ulQ8_1o5kT6)Qe<e+luM
z<+`2t+u->y4`%)+Ir6_-<$svu#`YJ3|B7F=ORQg}?~$YTEy;H!$mhYdAMZ`uk-p#(
h$<+(}BZhJyJkFy37)Z0}AGt)Zd2ddZvHEN3|NlPE+>HPL

literal 0
HcmV?d00001

diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/flags.make b/thirdparty/bmt/build/src/CMakeFiles/example.dir/flags.make
new file mode 100644
index 0000000..efb7961
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# compile CXX with /usr/bin/c++
+CXX_FLAGS =   -std=c++11
+
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/bemdeppi/ham/thirdparty/bmt/include 
+
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/link.txt b/thirdparty/bmt/build/src/CMakeFiles/example.dir/link.txt
new file mode 100644
index 0000000..868b0e9
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/link.txt
@@ -0,0 +1 @@
+/usr/bin/c++      CMakeFiles/example.dir/example.cpp.o  -o ../example -rdynamic 
diff --git a/thirdparty/bmt/build/src/CMakeFiles/example.dir/progress.make b/thirdparty/bmt/build/src/CMakeFiles/example.dir/progress.make
new file mode 100644
index 0000000..abadeb0
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/example.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 1
+CMAKE_PROGRESS_2 = 2
+
diff --git a/thirdparty/bmt/build/src/CMakeFiles/progress.marks b/thirdparty/bmt/build/src/CMakeFiles/progress.marks
new file mode 100644
index 0000000..0cfbf08
--- /dev/null
+++ b/thirdparty/bmt/build/src/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/thirdparty/bmt/build/src/Makefile b/thirdparty/bmt/build/src/Makefile
new file mode 100644
index 0000000..8963c02
--- /dev/null
+++ b/thirdparty/bmt/build/src/Makefile
@@ -0,0 +1,180 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake
+
+# Default target executed when no arguments are given to make.
+default_target: all
+
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+
+# Remove some rules from gmake that .SUFFIXES does not remove.
+SUFFIXES =
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+
+# Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+
+# A target that is always out of date.
+cmake_force:
+
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E remove -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/bemdeppi/ham/thirdparty/bmt
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/bemdeppi/ham/thirdparty/bmt/build
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake cache editor..."
+	/usr/bin/ccmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+
+.PHONY : rebuild_cache/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(CMAKE_COMMAND) -E cmake_progress_start /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles /home/bemdeppi/ham/thirdparty/bmt/build/src/CMakeFiles/progress.marks
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f CMakeFiles/Makefile2 src/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/bemdeppi/ham/thirdparty/bmt/build/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f CMakeFiles/Makefile2 src/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f CMakeFiles/Makefile2 src/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f CMakeFiles/Makefile2 src/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+src/CMakeFiles/example.dir/rule:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f CMakeFiles/Makefile2 src/CMakeFiles/example.dir/rule
+.PHONY : src/CMakeFiles/example.dir/rule
+
+# Convenience name for target.
+example: src/CMakeFiles/example.dir/rule
+
+.PHONY : example
+
+# fast build rule for target.
+example/fast:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/build
+.PHONY : example/fast
+
+example.o: example.cpp.o
+
+.PHONY : example.o
+
+# target to build an object file
+example.cpp.o:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/example.cpp.o
+.PHONY : example.cpp.o
+
+example.i: example.cpp.i
+
+.PHONY : example.i
+
+# target to preprocess a source file
+example.cpp.i:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/example.cpp.i
+.PHONY : example.cpp.i
+
+example.s: example.cpp.s
+
+.PHONY : example.s
+
+# target to generate assembly for a file
+example.cpp.s:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(MAKE) -f src/CMakeFiles/example.dir/build.make src/CMakeFiles/example.dir/example.cpp.s
+.PHONY : example.cpp.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... rebuild_cache"
+	@echo "... example"
+	@echo "... example.o"
+	@echo "... example.i"
+	@echo "... example.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/bemdeppi/ham/thirdparty/bmt/build && $(CMAKE_COMMAND) -H$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/thirdparty/bmt/build/src/cmake_install.cmake b/thirdparty/bmt/build/src/cmake_install.cmake
new file mode 100644
index 0000000..8c26235
--- /dev/null
+++ b/thirdparty/bmt/build/src/cmake_install.cmake
@@ -0,0 +1,34 @@
+# Install script for directory: /home/bemdeppi/ham/thirdparty/bmt/src
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "0")
+endif()
+
diff --git a/thirdparty/bmt/include/noma/bmt/bmt.hpp b/thirdparty/bmt/include/noma/bmt/bmt.hpp
new file mode 100644
index 0000000..d41751d
--- /dev/null
+++ b/thirdparty/bmt/include/noma/bmt/bmt.hpp
@@ -0,0 +1,258 @@
+// Copyright (c) 2013-2017 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef noma_bmt_bmt_hpp
+#define noma_bmt_bmt_hpp
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <fstream>
+#include <iomanip>
+#include <ratio>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+namespace noma {
+namespace bmt {
+
+using rep = double;
+using period = std::nano;
+using duration = std::chrono::duration<rep, period>;
+
+// make sure we have a steady clock, if possible one with a high resolution
+using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, std::chrono::high_resolution_clock, std::chrono::steady_clock>::type;
+using time_point = clock::time_point;
+
+// convenience duration types
+using nanoseconds = std::chrono::duration<rep, std::nano>;
+using microseconds = std::chrono::duration<rep, std::micro>;
+using milliseconds = std::chrono::duration<rep, std::milli>;
+using seconds = std::chrono::duration<rep, std::ratio<1>>;
+using minutes = std::chrono::duration<rep, std::ratio<60>>;
+using hours = std::chrono::duration<rep, std::ratio<3600>>;
+
+
+// NOTE: the code below assumes floating point arithmetic on rep (the type returned by duration::count())
+static_assert(std::chrono::treat_as_floating_point<duration::rep>::value, "rep is required to be a floating point type");
+
+class timer
+{
+public:
+	timer() : start(clock::now()) {}
+
+	duration elapsed() const
+	{
+		// NOTE: conversion from clock's duration type to ours (see above)
+		return std::chrono::duration_cast<duration>(clock::now() - start);
+	}
+	
+private:
+	time_point start;
+};
+
+class statistics
+{
+public:
+	statistics() = default;
+
+	/**
+	 * Ctor with name and pre-allocation of internal vector of timings measured.
+	 * If the name is used a first column is added for table output.
+	 * If the number of measurements is known, it should be used to 
+	 * avoid re-allocating memory while benchmarking.
+	 * Optionally, a number of ignored warm-up values can be specified.
+	 */
+	statistics(const std::string& name, size_t expected_count, size_t warmup_count = 0) : warmup_count_(warmup_count), name_(name)
+	{
+		times_.reserve(expected_count);
+	}
+
+	/**
+	 * Same as above, but without a name, i.e. without the leading name column in the table output.
+	 */
+	statistics(size_t count, size_t warmup_count = 0) : statistics("", count, warmup_count) { }
+	
+	// add a timer
+	void add(const timer& t) { add(t.elapsed()); }
+
+	// Add one measured duration. While warm-up values are still pending, the value
+	// only consumes one warm-up slot. Otherwise it is stored and count, running
+	// average, sum of squared deviations (Welford's online update), min and max
+	// are brought up to date.
+	void add(const duration& value)
+	{
+		// ignore warmup values
+		if (warmup_count_ > 0)
+		{
+			--warmup_count_;
+			return;
+		}
+
+		times_.push_back(value);
+		++count_;
+		const duration delta = value - average_; // deviation from the previous average
+		average_ += duration(delta.count() / count_);
+		// Welford: deviation from the previous average times deviation from the
+		// updated one. Squaring delta instead would add value^2 for the very first
+		// sample and report a non-zero variance for constant data.
+		variance_ += duration(delta.count() * (value - average_).count());
+
+		// the first sample defines both extrema (min_/max_ start at 0.0),
+		// later samples can only widen the range
+		min_ = (count_ == 1) ? value : std::min(min_, value);
+		max_ = (count_ == 1) ? value : std::max(max_, value);
+	}
+
+	size_t count() const { return count_; }
+
+	duration average() const { return average_; }
+
+	// Median of all recorded values; duration(0.0) if nothing was recorded.
+	duration median() const
+	{
+		// NOTE: when comparing this with the mathematical definition, keep in mind our indices start with 0
+		const size_t n = times_.size();
+		if (n == 0)
+			return duration(0.0);
+
+		// times_ holds the values in insertion order, while the median is defined
+		// on the ordered sequence, so select on a sorted copy (this method is const)
+		std::vector<duration> sorted(times_);
+		std::sort(sorted.begin(), sorted.end());
+
+		if ((n % 2) == 0) // even number of values: average the two middle elements
+			return duration(0.5 * (sorted[(n / 2) - 1].count() + sorted[n / 2].count()));
+
+		return sorted[n / 2]; // odd number of values: the middle element
+	}
+
+	duration min() const { return min_; }
+
+	duration max() const { return max_; }
+
+	const std::string& name() const { return name_; }
+
+	duration variance() const 
+	{ 
+		return duration((count_ <= 1) ? 0.0 : variance_.count() / rep(count_ - 1)); 
+	}
+
+	// standard error of the mean: sqrt(variance() / count), 0 if nothing was recorded (avoids 0/0 = NaN)
+	duration std_error() const
+	{
+		return duration((count_ == 0) ? 0.0 : std::sqrt(variance().count() / rep(count_)));
+	}
+
+	// relative error (to repeat measurements until small enough)
+	duration relative_std_error() const
+	{
+		return duration(std_error().count() / average().count());
+	}
+
+	// delta value for the 95% confidence interval
+	// (not student's t-test but normal distribution)
+	// [average - error, average + error]
+	duration conf95_error() const
+	{ 
+		return 1.96 * std_error();
+	}
+
+	// relative error (to repeat measurements until small enough)
+	duration relative_conf95_error() const
+	{
+		return duration(conf95_error().count() / average().count());
+	}
+
+
+	// returns the header for the string() method
+	static std::string header_string(bool name_column)
+	{
+		std::stringstream ss;
+
+		// add name column if name was set
+		if (name_column)
+			ss << "name" << "\t";
+		
+		ss << "average" << "\t"
+		   << "median" << "\t"
+		   << "min" << "\t"
+		   << "max" << "\t"
+		   << "variance" << "\t"
+		   << "std_error" << "\t"
+		   << "relative_std_error" << "\t"
+		   << "conf95_error" << "\t"
+		   << "relative_conf95_error" << "\t"
+		   << "count";
+		return ss.str();
+	}
+
+	std::string header_string() const
+	{
+		return header_string(!name_.empty());
+	}
+
+	// returns all data in one line separated by tabs
+	std::string string() const
+	{
+		std::stringstream ss;
+		
+		// add name column if name was set
+		if (!name_.empty())
+			ss << name_ << "\t";
+		
+		ss << std::scientific // << std::fixed
+		   << average().count() << "\t"
+		   << median().count() << "\t"
+		   << min().count() << "\t"
+		   << max().count()  << "\t"
+		   << variance().count() << "\t"
+		   << std_error().count() << "\t"
+		   << relative_std_error().count() << "\t"
+		   << conf95_error().count() << "\t"
+		   << relative_conf95_error().count() << "\t"
+		   << count();
+		return ss.str();
+	}
+
+	// writes the raw data to a file (one duration per line)
+	void to_file(std::string filename) const
+	{
+		std::ofstream file(filename.c_str());
+		file << std::scientific;
+
+		for (size_t i = 0; i < times_.size(); ++i)
+		{
+			file << times_[i].count() << std::endl;
+		}
+
+		file.close();
+	}
+
+	duration sum() const
+	{
+		duration result(0.0);
+		for (size_t i = 0; i < times_.size(); ++i)
+			result += times_[i];
+		return result;
+	}
+
+private:
+	size_t count_ { 0 }; // event counter
+	size_t warmup_count_ { 0 }; // number of values to drop before taking data
+	duration average_ { 0.0 }; // average
+	duration variance_ { 0.0 }; // variance
+	duration min_ { 0.0 }; // global minimum
+	duration max_ { 0.0 }; // global maximum
+	std::vector<duration> times_; // all measured values
+	std::string name_;
+};
+
+} // namespace bmt
+} // namespace noma
+
+#endif // noma_bmt_bmt_hpp
diff --git a/thirdparty/bmt/src/CMakeLists.txt b/thirdparty/bmt/src/CMakeLists.txt
new file mode 100644
index 0000000..bd679a6
--- /dev/null
+++ b/thirdparty/bmt/src/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Copyright (c) 2017 Matthias Noack <ma.noack.pr@gmail.com>
+#
+# See accompanying file LICENSE and README for further information.
+
+# do not put executable into subdir
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+
+if (NOMA_BMT_BUILD_EXAMPLES)
+	# simple example application measuring overhead
+	add_executable(example example.cpp)
+	target_link_libraries(example noma_bmt)
+endif ()
+
diff --git a/thirdparty/bmt/src/example.cpp b/thirdparty/bmt/src/example.cpp
new file mode 100644
index 0000000..7f2953f
--- /dev/null
+++ b/thirdparty/bmt/src/example.cpp
@@ -0,0 +1,58 @@
+// Copyright (c) 2013-2017 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <noma/bmt/bmt.hpp>
+
+#include <iostream>
+#include <thread>
+
+namespace bmt = ::noma::bmt;
+
+// Example: measures the overhead of the timer itself and a 25 ms sleep, then
+// prints one statistics table row per benchmark plus both averages.
+int main(int args, char* argv[])
+{
+	size_t iterations = 100; // iterations to be measured
+	size_t warmup_iterations = 5; // iterations to be skipped before starting measuring
+
+	// generate a table header with name column
+	std::cout << bmt::statistics::header_string(true) << std::endl;
+
+	// benchmark the cost of timing
+	bmt::statistics timing_overhead_stats {"timing_overhead", iterations, warmup_iterations};
+
+	// do all iterations, including warmup_iterations which will be ignored by stats
+	for (size_t i = 0; i < (iterations + warmup_iterations); ++i)
+	{
+		bmt::timer timer; // create a timer, starts measuring on construction
+		// nothing to do
+		timing_overhead_stats.add(timer); // add timer to statistics object (measuring is stopped)
+	}
+
+	// benchmark something that takes time
+	bmt::statistics sleep_for_stats {"sleep_for", iterations, warmup_iterations};
+
+	// do all iterations, including warmup_iterations which will be ignored by stats
+	for (size_t i = 0; i < (iterations + warmup_iterations); ++i)
+	{
+		bmt::timer timer; // create a timer, starts measuring on construction
+		std::this_thread::sleep_for(bmt::milliseconds { 25 }); // spend some time
+		sleep_for_stats.add(timer); // add timer to statistics object (measuring is stopped)
+	}
+
+	// output table entries with complete data
+	std::cout << timing_overhead_stats.string() << std::endl;
+	std::cout << sleep_for_stats.string() << std::endl;
+
+	// output just the averages in different units
+	std::cout << timing_overhead_stats.name() << " average: "
+	          << std::chrono::duration_cast<bmt::nanoseconds>(timing_overhead_stats.average()).count() << " ns"
+	          << std::endl;
+	std::cout << sleep_for_stats.name() << " average: "
+	          << std::chrono::duration_cast<bmt::milliseconds>(sleep_for_stats.average()).count() << " ms"
+	          << std::endl;
+
+	return 0;
+}

From d69ef0b1bf52137df2cb059372bdc790019c714d Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 14:47:00 +0100
Subject: [PATCH 071/150] initial commit of tcp backend

---
 CMakeLists.txt                                |   2 +-
 include/ham/net/communicator.hpp              |   2 +
 .../ham/net/communicator_mpi_rma_dynamic.hpp  |  10 +-
 include/ham/net/communicator_tcp.hpp          | 504 ++++++++++++++++++
 include/ham/offload/offload.hpp               |   5 +-
 src/CMakeLists.txt                            |  22 +
 src/ham/CMakeLists.txt                        |  18 +-
 src/ham/net/communicator_tcp.cpp              |   9 +
 8 files changed, 562 insertions(+), 10 deletions(-)
 create mode 100644 include/ham/net/communicator_tcp.hpp
 create mode 100644 src/ham/net/communicator_tcp.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index cf05180..5c48af8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,7 @@ message(STATUS "CMAKE_BINARY_DIR: " ${CMAKE_BINARY_DIR})
 ### thirdparty dependencies
 
 # Boost
-find_package(Boost 1.40 COMPONENTS program_options REQUIRED)
+find_package(Boost 1.40 COMPONENTS program_options system REQUIRED)
 add_library(boost_library INTERFACE)
 target_include_directories (boost_library INTERFACE ${Boost_INCLUDE_DIRS})
 target_link_libraries (boost_library INTERFACE ${Boost_LIBRARIES})
diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index a0a6164..390279a 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -58,6 +58,8 @@ namespace net {
 #include "ham/net/communicator_scif.hpp"
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
 #include "ham/net/communicator_mpi_rma_dynamic.hpp"
+#elif defined HAM_COMM_TCP
+#include "ham/net/communicator_tcp.hpp"
 #else
 static_assert(false, "Please define either HAM_COMM_MPI, HAM_COMM_MPI_RMA_DYNAMIC or HAM_COMM_SCIF.");
 #endif
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 4afd7f5..4c4bb65 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -163,7 +163,7 @@ class communicator {
 
 	communicator(int argc, char* argv[])
 	{
-		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI" << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising MPI" << std::endl; )
 
 		instance_ = this;
 		int p;
@@ -172,7 +172,7 @@ class communicator {
 		{
 			std::cerr << "Could not initialise MPI with MPI_THREAD_MULTIPLE, MPI_Init_thread() returned " << p << std::endl;
 		}
-		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI ..." << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising MPI ..." << std::endl; )
 
 		int t;
 		MPI_Comm_rank(MPI_COMM_WORLD, &t);
@@ -181,7 +181,7 @@ class communicator {
 		nodes_ = t;
 		host_node_ = 0; // TODO(improvement): make configureable, like for SCIF
 
-		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI done" << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising MPI done" << std::endl; )
 
 		peers = new mpi_peer[nodes_];
 		
@@ -269,7 +269,7 @@ class communicator {
                 }
 
                 // debug msg
-                HAM_DEBUG( std::cout << "Rank: " << this_node_ << " in loop run " << i << " created REAL windows..." << std::endl; )
+                HAM_DEBUG( HAM_LOG << "Rank: " << this_node_ << " in loop run " << i << " created REAL windows..." << std::endl; )
 
 
             } else { // create remote windows without memory (join the collective call and retreive the window handle)
@@ -277,7 +277,7 @@ class communicator {
                 MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_flag_win));
                 // MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
                 // debug msg
-                HAM_DEBUG( std::cout << "Rank: " << this_node_ << " in loop run " << i << " creating EMPTY windows..." << std::endl; )
+                HAM_DEBUG( HAM_LOG << "Rank: " << this_node_ << " in loop run " << i << " creating EMPTY windows..." << std::endl; )
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].msg_win_data, &(peers[i].rma_msg_win));
                 //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].flag_win_data, &(peers[i].rma_flag_win));
             }
diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
new file mode 100644
index 0000000..66d4a55
--- /dev/null
+++ b/include/ham/net/communicator_tcp.hpp
@@ -0,0 +1,504 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef ham_net_communicator_tcp_hpp
+#define ham_net_communicator_tcp_hpp
+
+#include <cassert>
+#include <cstring> // memcpy
+#include <stdlib.h> // posix_memalign
+#include <thread> // async thread
+
+#include <boost/asio.hpp>
+#include <boost/program_options.hpp>
+
+#include "ham/misc/options.hpp"
+#include "ham/misc/constants.hpp"
+#include "ham/misc/resource_pool.hpp"
+#include "ham/misc/types.hpp"
+#include "ham/util/debug.hpp"
+#include "ham/util/log.hpp"
+
+using boost::asio::ip::tcp;
+
+namespace ham {
+namespace net {
+
+template<typename T>
+class buffer_ptr {
+public:
+	buffer_ptr();
+	buffer_ptr(T* ptr, node_t node) : ptr_(ptr), node_(node) { }
+
+	T* get() { return ptr_; }
+	node_t node() { return node_; }
+	
+	// element access
+	T& operator [] (size_t i);
+
+	// basic pointer arithmetic to address sub-buffers
+	buffer_ptr<T> operator+(size_t off)
+	{
+		return buffer_ptr(ptr_ + off, node_);
+	}
+
+private:
+	T* ptr_;
+	node_t node_;
+};
+
+class node_descriptor
+{
+public:
+	const char* name() const { return name_; }
+private:
+	//std::string name_; // TODO(improvement): unify node description for all back-ends, NOTE: std::string is not trivally transferable
+	char name_[64]= "Node descriptions not available for TCP backend";
+
+	friend class net::communicator;
+};
+
+class communicator : public std::enable_shared_from_this<communicator> {
+public:
+	// externally used interface of request must be shared across all communicator-implementations
+	class request {
+	public:
+		request() : valid_(false), received_(false), sent_(false) {} // instantiate invalid
+		
+		request(node_t target_node, node_t source_node, size_t send_buffer_index, size_t recv_buffer_index)
+		 : target_node(target_node), source_node(source_node), valid_(true), sent_(false), received_(false), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0)
+		{}
+
+		// return true if request was finished
+		bool test()
+		{
+            // tcp backend does not feature asynchronous operations yet
+            // HAM_DEBUG( HAM_LOG << "request::test(), TCP backend does not feature asynchronous operations" << std::endl; )
+
+			// int flag = 0;
+			// MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // just test the receive request, since the send belonging to the request triggers the remote send that is received
+			return received_;
+		}
+
+		void* get() // blocks
+		{
+			// tcp backend does not feature asynchronous operations yet
+            // HAM_DEBUG( HAM_LOG << "request::get(), TCP backend does not feature asynchronous operations" << std::endl; )
+            // HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+			// MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
+
+			// block until async receive handler reports completion
+			while(!received_);
+
+			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
+		}
+
+		template<class T>
+		void send_result(T* result_msg, size_t size)
+		{
+			assert(communicator::this_node() == target_node); // this assert fails if send_result is called from the wrong side
+			
+			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
+			// MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
+
+			communicator::instance().send_result(target_node, result_msg, size);
+            // don't need size * sizeof(T) because req.send_result is called as send_result((void*)&a, sizeof(a)) in offload_msg.hpp
+		}
+
+		bool valid() const
+		{
+			return valid_;
+		}
+
+		bool received() const {
+			return received_;
+		}
+
+		bool sent() const {
+			return sent_;
+		}
+
+        node_t target_node;
+		node_t source_node;
+		bool valid_;
+		bool received_; // used for the async receive handler to set to true, checked for completion
+		bool sent_; // used for the async send handler to set to true... unused, but the handler likes to do something
+
+		// only needed by the sender
+		enum { NUM_REQUESTS = 3 };
+		
+		size_t send_buffer_index; // buffer to use for sending the message
+		size_t recv_buffer_index; // buffer to use for receiving the result
+		size_t req_count;
+		
+	private:
+		// not needed since tcp backend does not offer async operations
+        // MPI_Request mpi_reqs[NUM_REQUESTS]; // for sending the msg, receiving the result, and an associated data transfer
+	}; // class request
+	
+	typedef request& request_reference_type;
+	typedef const request& request_const_reference_type;
+
+	communicator(int argc, char* argv[]) : node_desc_dummy()
+	{
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising configuration" << std::endl; )
+
+		instance_ = this;
+
+		// command line configuration
+		nodes_ = 0;		// number of nodes
+		this_node_ = 0;		// "rank" of this node
+		this_port_ = 0;		// tcp port used for this node
+		host_node_ = 0;		// host node
+		host_address_ = "empty";		// host IP address or resolvable name
+		host_port_ = 0;		// host port
+
+
+		// command line options
+		boost::program_options::options_description desc("HAM Options");
+		desc.add_options()
+				("ham-help", "Shows this message")
+				("ham-process-count", boost::program_options::value(&nodes_)->required(), "Required: Number of processes the job consists of.")
+				("ham-address", boost::program_options::value(&this_node_)->required(), "Required: This processes UNIQUE address, between 0 and ham-process-count-1. 0 will make the process the host (required EXACTLY once). -1 will assign any free non-host rank.")
+				("ham-tcp-port", boost::program_options::value(&this_port_)->default_value(this_port_), "TCP port used if this process is a client. Default will auto select an available port. Host will use ham-tcp-hostport and ignore this.")
+				("ham-tcp-hostname", boost::program_options::value(&host_address_)->required(), "Required: IP address or resolvable hostname of the host process. Required. May be used on host to select interface.")
+				("ham-tcp-hostport", boost::program_options::value(&host_port_)->required(), "Required: TCP port used by the host.")
+				;
+
+		boost::program_options::variables_map vm;
+
+		const char* options_env = std::getenv("HAM_OPTIONS");
+		if (options_env)
+		{
+			char split_character = ' ';
+			if (std::getenv("HAM_OPTIONS_NO_SPACES")) // value does not matter
+				split_character = '_';
+
+			// parse from environment
+			boost::program_options::store(boost::program_options::command_line_parser(detail::options::split(std::string(options_env), split_character)).options(desc).allow_unregistered().run(), vm);
+		}
+		else
+		{
+			// parse from command line
+			boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(desc).allow_unregistered().run(), vm);
+		}
+
+		boost::program_options::notify(vm);
+
+		if(vm.count("ham-help"))
+		{
+			std::cout << desc << std::endl;
+			exit(0);
+		}
+
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): command line config:" << std::endl
+							 << "ham-process-count: " << nodes_ << std::endl
+							 << "ham-address: " << this_node_ << std::endl
+				   			 << "ham-tcp-port: " << this_port_ << std::endl
+							 << "ham-tcp-hostname: " << host_address_ << std::endl
+							 << "ham-tcp-hostport: " << host_port_ << std:: endl;
+  		)
+
+
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising configuration done" << std::endl; )
+
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): connecting targets to host" << std::endl; )
+
+
+		// init peers structure
+		peers = new tcp_peer[nodes_];
+
+		// targets init tcp connection to host
+		if(!is_host()) {
+			tcp::socket sock(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
+			peers[host_node_].tcp_socket = &sock;
+            tcp::resolver resolver(io_context);
+			boost::asio::connect(*peers[host_node_].tcp_socket, resolver.resolve({host_address_, host_port_}));
+
+			// send requested rank to host
+			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << "from host" << std::endl; )
+			boost::asio::write(peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
+			// recv rank from host
+			boost::asio::read(peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
+			HAM_DEBUG( HAM_LOG << "communicator::communicator(): received ham-address " << this_node_ << "from host" << std::endl; )
+		}
+
+		// host accepts tcp connection from targets
+		if(is_host()) {
+			tcp::resolver resolver(io_context);
+			tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
+			tcp::resolver::iterator iter = resolver.resolve(query);
+			tcp::endpoint endpoint = iter->endpoint();
+			tcp::acceptor acc(io_context, endpoint);
+
+			node_t req_ranks[nodes_]; // store requested ranks in order of connection
+			tcp::socket temp_socks[nodes_]; // store sockets temporarily in connection order
+			bool taken_ranks[nodes_] {false};
+			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
+
+			for(int i=1; i < nodes_; i++) {
+				temp_socks[i] = acc.accept(); // accept connection
+
+				// recv rank
+				boost::asio::read(temp_socks[i], boost::asio::buffer((void *) &req_ranks[i], sizeof(node_t)));
+			}
+
+			// rearrange sockets and inform targets of resulting rank
+			for (int j = 1; j < nodes_; ++j) {
+				if(req_ranks[j] < -1 || req_ranks[j] > nodes_-1) { // check if rank invalid
+					std::cout << "communicator::communicator(): illegal ham-address requested:" << req_ranks[j] << std::endl;
+					exit(-1);
+				}else if(req_ranks[j] == -1) { // skip wildcard ranks, handled later to avoid conflicting ranks with following connects
+					HAM_DEBUG( HAM_LOG << "communicator::communicator(): connection " << j << " requested wildcard ham-address" << std::endl; )
+					continue;
+				}
+				if(taken_ranks[req_ranks[j]]) { // check if rank already taken
+					std::cout << "communicator::communicator(): ham-address requested more than once:" << req_ranks[j] << std::endl;
+					exit(-1);
+				} else {
+					node_t rrank = req_ranks[j];
+					HAM_DEBUG( HAM_LOG << "communicator::communicator(): connection " << j << " requested ham-address: " << rrank << std::endl; )
+					peers[rrank].tcp_socket = std::move(temp_socks[j]); // = move https://www.boost.org/doc/libs/1_65_0/doc/html/boost_asio/reference/basic_stream_socket/operator_eq_.html
+					taken_ranks[rrank] = true; // mark the requested rank as taken
+					HAM_DEBUG( HAM_LOG << "communicator::communicator(): associated ham-address: " << rrank << " with connection " << j << std::endl; )
+					// send assigned rank to target
+					boost::asio::write(peers[rrank].tcp_socket, boost::asio::buffer((void*)&rrank, sizeof(rrank)));
+				}
+			}
+
+			// handle wildcard ranks
+			for (int k = 1; k < nodes_; ++k) { // k is index to connections in connection order
+				if(req_ranks[k] == -1) { // find wildcard connections
+
+					for (int i = 1; i < nodes_; ++i) { // i is index to ranks in final rank order
+						if(!taken_ranks[i]) { // find a free rank
+							HAM_DEBUG( HAM_LOG << "communicator::communicator(): associating wildcard connection: " << k << " with ham-address " << i << std::endl; )
+							peers[i].tcp_socket = temp_socks[k];
+							taken_ranks[i] = true;
+							boost::asio::write(peers[i].tcp_socket, boost::asio::buffer((void*)&i, sizeof(i)));
+							break; // stop if free rank is assigned, go back to k-loop for next wildcard connection
+						}
+					}
+				}
+			}
+		}
+
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): connecting hosts done" << std::endl; )
+
+		// host init message buffers
+		if (is_host()) {
+			for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+				// allocate buffers
+				peers[i].msg_buffers = allocate_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+				// fill resource pools
+				for(size_t j = constants::MSG_BUFFERS; j > 0; --j) {
+					peers[i].buffer_pool.add(j-1);
+				}
+			}
+
+			// host runs io_context in separate thread (asynchronous progress thread) for async operations
+			boost::asio::io_service::work work(io_context);
+			std::thread thread([&io_context](){ io_context.run(); });
+		}
+
+
+
+	}
+
+	~communicator()
+	{
+		// finalize
+		if(is_host()) {
+			io_context.stop();
+		}
+		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
+	}
+
+
+	request allocate_request(node_t remote_node)
+	{
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+
+		const size_t send_buffer_index = peers[remote_node].buffer_pool.allocate();
+		const size_t recv_buffer_index = peers[remote_node].buffer_pool.allocate();
+
+		return { remote_node, this_node_, send_buffer_index, recv_buffer_index };
+	}
+
+	void free_request(request& req)
+	{
+		assert(req.valid());
+		assert(req.source_node == this_node_);
+	
+		tcp_peer& peer = peers[req.target_node];
+
+		peer.buffer_pool.free(req.send_buffer_index);
+		peer.buffer_pool.free(req.recv_buffer_index);
+		req.valid_ = false;
+	}
+
+public:
+
+	// called by host only
+	void send_msg(request_reference_type req, void* msg, size_t size)
+	{
+		// copy message from caller into transfer buffer
+		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
+		memcpy(msg_buffer, msg, size);
+
+		// tcp write
+		auto self(shared_from_this());
+		boost::asio::async_write(peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
+								[this, self, &req](boost::system::error_code ec, size_t length) {
+									req.sent_ = true;
+								}
+		);
+		// MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+	}
+	
+	// to be used by the offload target's main loop: synchronously receive one message at a time
+	// NOTE: the local static receive buffer!
+	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE)
+	{
+		static msg_buffer buffer; // NOTE !
+		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+		boost::asio::read(peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, size));
+        return static_cast<void*>(&buffer);
+	}
+
+    // send result through communicator
+    // only to be used by request.send_result()
+    template<class T>
+    void send_result(node_t target_node, T* message, size_t size) {
+
+        boost::asio::write(peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
+    }
+
+	// trigger receiving the result of an active message on the host
+	void recv_result(request_reference_type req)
+	{
+		// tcp receive
+        auto self(shared_from_this());
+        boost::asio::async_read(peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
+				[this, self, &req](boost::system::error_code ec, size_t length) {
+					req.received_ = true;
+				}
+		);
+		// MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+		return;
+	}
+
+	template<typename T>
+	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
+	{
+		// tcp send
+
+        boost::asio::write(peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size * sizeof(T)));
+		// MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
+	}
+
+	// to be used by the host
+	template<typename T>
+	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
+	{
+		auto self(shared_from_this());
+		boost::asio::async_write(peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size*sizeof(T)),
+								 [this, self, &req](boost::system::error_code ec, size_t length) {
+									 req.sent_ = true;
+								 }
+		);
+		// MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+	}
+
+
+	template<typename T>
+	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
+	{
+		// synchronous tcp receive of size elements of T into local_dest; tcp_peer::tcp_socket is a tcp::socket*, so dereference it for boost::asio::read()
+		boost::asio::read(*peers[remote_source.node()].tcp_socket, boost::asio::buffer(static_cast<void*>(local_dest), size * sizeof(T)));
+		// MPI_Recv((void*)local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+	}
+	
+	// to be used by the host
+	template<typename T>
+	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
+	{
+		auto self(shared_from_this());
+		// tcp_peer::tcp_socket is a tcp::socket*, so dereference it; the completion handler marks req as received
+		boost::asio::async_read(*peers[remote_source.node()].tcp_socket, boost::asio::buffer(static_cast<void*>(local_dest), size * sizeof(T)),
+								[this, self, &req](boost::system::error_code ec, size_t length) {
+									req.received_ = true;
+								}
+		);
+		// MPI_Irecv(static_cast<void*>(local_dest), size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+	}
+
+	// Allocates uninitialised storage for n objects of T, aligned to constants::CACHE_LINE_SIZE; yields a null buffer_ptr if that fails.
+	template<typename T>
+	buffer_ptr<T> allocate_buffer(const size_t n, node_t source_node)
+	{
+		T* ptr = nullptr;
+		if (posix_memalign(reinterpret_cast<void**>(&ptr), constants::CACHE_LINE_SIZE, n * sizeof(T)) != 0)
+			ptr = nullptr; // allocation failed: never hand out an indeterminate pointer
+		return buffer_ptr<T>(ptr, this_node_); // NOTE: no ctor is called, source_node is ignored
+	}
+
+	template<typename T>
+	void free_buffer(buffer_ptr<T> ptr)
+	{
+		assert(ptr.node() == this_node_);
+		// NOTE: no dtor is called
+		free(static_cast<void*>(ptr.get()));
+	}
+
+	static communicator& instance() { return *instance_; }
+	static node_t this_node() { return instance().this_node_; }
+	static size_t num_nodes() { return instance().nodes_; }
+	bool is_host() { return this_node_ == 0; } // TODO(improvement): ham_address == ham_host_address ; }
+	bool is_host(node_t node) { return node == 0; } // TODO(improvement): node == ham_host_address; }
+
+	static const node_descriptor& get_node_description(node_t node)
+	{
+        return instance().node_desc_dummy;
+	}
+
+private:
+	static communicator* instance_;
+	size_t nodes_;
+	node_t this_node_;
+	int this_port_;
+	node_t host_node_;
+	std::string host_address_;
+	int host_port_;
+    node_descriptor node_desc_dummy;
+	boost::asio::io_service io_context;
+		
+	struct tcp_peer {
+		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
+
+		// needed by sender to manage which buffers are in use and which are free
+		// just manages indices, that can be used by
+		detail::resource_pool<size_t> buffer_pool;
+
+		// tcp socket
+		tcp::socket* tcp_socket;
+	};
+	
+	tcp_peer* peers;
+};
+
+template<typename T>
+buffer_ptr<T>::buffer_ptr() : buffer_ptr(nullptr, communicator::this_node()) { }
+
+template<typename T>
+T& buffer_ptr<T>::operator[](size_t i)
+{
+	assert(node_ == communicator::this_node());
+	return ptr_[i];
+}
+
+} // namespace net
+} // namespace ham
+
+#endif // ham_net_communicator_tcp_hpp
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 0148e7f..58e7e19 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -223,7 +223,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	// TODO(improvement): create a data transfer thread for one-sided
 	comm.send_data(local_source, remote_dest, n); // sync
 	return future<void>(true); // return dummy future
-#elif defined HAM_COMM_MPI
+#elif defined(HAM_COMM_MPI) || defined(HAM_COMM_TCP)
 	// allocate a request and construct a future
 	future<void> result(comm.allocate_request(remote_dest.node()));
 	// generate an offload message
@@ -261,7 +261,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	// TODO(improvement): create a data transfer thread for one-sided
 	comm.recv_data(remote_source, local_dest, n); // sync
 	return future<void>(true); // return dummy future
-#elif defined HAM_COMM_MPI
+#elif defined(HAM_COMM_MPI) || defined(HAM_COMM_TCP)
 	// allocate a request and construct a future
 	future<void> result(comm.allocate_request(remote_source.node()));
 	// generate an offload message
@@ -271,7 +271,6 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
 	comm.recv_result(result.get_request()); // trigger receiving the result
 	// TODO(improvement): the recv_result() is not needed, could remove and remove send_result() from offload_read_msg to reduce synchronization overhead
-
 	return result;
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
 	future<void> result(comm.allocate_data_request(remote_source.node()));
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index b34c36e..8dbb21b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -11,6 +11,11 @@ add_subdirectory(ham)
 ### Benchmarks
 
 ## Explicit targets (not built by default)
+
+# TCP benchmarks
+add_executable(benchmark_ham_offload_tcp benchmark_ham_offload.cpp)
+target_link_libraries(benchmark_ham_offload_tcp ham_offload_tcp)
+
 # Intel LEO offload directive benchmark, requires Intel compiler
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
 	add_executable(benchmark_intel_leo EXCLUDE_FROM_ALL benchmark_intel_leo.cpp)
@@ -36,7 +41,21 @@ endif()
 add_executable(active_msgs active_msgs.cpp)
 target_link_libraries(active_msgs ham_interface)
 
+# TCP tests
+add_executable(ham_offload_test_tcp ham_offload.cpp)
+target_link_libraries(ham_offload_test_tcp ham_offload_tcp)
+
+add_executable(inner_product_tcp inner_product.cpp)
+target_link_libraries(inner_product_tcp ham_offload_tcp)
+
+add_executable(test_data_transfer_tcp test_data_transfer.cpp)
+target_link_libraries(test_data_transfer_tcp ham_offload_tcp)
+
+add_executable(test_argument_transfer_tcp test_argument_transfer.cpp)
+target_link_libraries(test_argument_transfer_tcp ham_offload_tcp)
+
 if (MPI_FOUND)
+# two-sided MPI
 	add_executable(ham_offload_test_mpi ham_offload.cpp)
 	target_link_libraries(ham_offload_test_mpi ham_offload_mpi)
 
@@ -52,6 +71,8 @@ if (MPI_FOUND)
 	add_executable(test_argument_transfer_mpi test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi ham_offload_mpi)
 
+# RMA MPI
+
 	add_executable(ham_offload_test_mpi_rma_dynamic ham_offload.cpp)
 	target_link_libraries(ham_offload_test_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
 
@@ -63,6 +84,7 @@ if (MPI_FOUND)
 
 	add_executable(test_argument_transfer_mpi_rma_dynamic test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+
 endif()
 
 if (SCIF_FOUND)
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
index 278d452..cc0df10 100644
--- a/src/ham/CMakeLists.txt
+++ b/src/ham/CMakeLists.txt
@@ -8,7 +8,7 @@ cmake_minimum_required(VERSION 3.2 FATAL_ERROR) # TODO verfify
 
 # interface target for ham
 add_library(ham_interface INTERFACE)
-target_compile_features(ham_interface INTERFACE cxx_auto_type cxx_range_for cxx_variadic_templates)
+target_compile_features(ham_interface INTERFACE )
 target_link_libraries(ham_interface INTERFACE noma_bmt boost_library)
 target_include_directories(ham_interface INTERFACE ${CMAKE_CURRENT_LIST_DIR}/../../include)
 
@@ -20,6 +20,22 @@ set(HAM_LIB_SRC
 	offload/offload.cpp
 	util/cpu_affinity.cpp)
 
+# TCP
+add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
+		net/communicator.cpp
+		net/communicator_tcp.cpp
+		offload/runtime.cpp
+		offload/offload.cpp
+		offload/main.cpp
+		util/cpu_affinity.cpp)
+target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
+target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library)
+
+set_target_properties(ham_offload_tcp PROPERTIES
+		CXX_STANDARD 11
+		CXX_STANDARD_REQUIRED YES
+		CXX_EXTENSIONS NO)
+
 if (MPI_FOUND)
 	add_library(ham_offload_mpi # SHARED if BUILD_SHARED_LIBS = TRUE
 	            ${HAM_LIB_SRC}
diff --git a/src/ham/net/communicator_tcp.cpp b/src/ham/net/communicator_tcp.cpp
new file mode 100644
index 0000000..e4e5dbd
--- /dev/null
+++ b/src/ham/net/communicator_tcp.cpp
@@ -0,0 +1,9 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/net/communicator.hpp"
+
+ham::net::communicator* ham::net::communicator::instance_ = nullptr;
+

From 7f778bbc9d504b615ee5bab51ab9ee02e3994706 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 15:35:55 +0100
Subject: [PATCH 072/150] changed client connection

---
 include/ham/net/communicator_tcp.hpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 66d4a55..defa070 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -212,10 +212,12 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 		// targets init tcp connection to host
 		if(!is_host()) {
-			tcp::socket sock(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
-			peers[host_node_].tcp_socket = &sock;
+            tcp::socket sock(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
+            peers[host_node_].tcp_socket = &sock;
             tcp::resolver resolver(io_context);
-			boost::asio::connect(*peers[host_node_].tcp_socket, resolver.resolve({host_address_, host_port_}));
+            tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
+            tcp::resolver::iterator iter = resolver.resolve(query);
+			boost::asio::connect(*peers[host_node_].tcp_socket, iter);
 
 			// send requested rank to host
 			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << "from host" << std::endl; )

From 9201b23dbabb4df8e30235176836e9bf7b55f904 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 15:53:22 +0100
Subject: [PATCH 073/150] changed client connection

---
 include/ham/net/communicator_tcp.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index defa070..09ace40 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -153,7 +153,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 		this_port_ = 0;		// tcp port used for this node
 		host_node_ = 0;		// host node
 		host_address_ = "empty";		// host IP address or resolvable name
-		host_port_ = 0;		// host port
+		host_port_ = "empty";		// host port
 
 
 		// command line options
@@ -215,9 +215,9 @@ class communicator : public std::enable_shared_from_this<communicator> {
             tcp::socket sock(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
             peers[host_node_].tcp_socket = &sock;
             tcp::resolver resolver(io_context);
-            tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
-            tcp::resolver::iterator iter = resolver.resolve(query);
-			boost::asio::connect(*peers[host_node_].tcp_socket, iter);
+            //tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
+            //tcp::resolver::iterator iter = resolver.resolve(query);
+			boost::asio::connect(*peers[host_node_].tcp_socket, resolver.resolve(&host_address_, &host_port_));
 
 			// send requested rank to host
 			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << "from host" << std::endl; )
@@ -472,7 +472,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	int this_port_;
 	node_t host_node_;
 	std::string host_address_;
-	int host_port_;
+	std::string host_port_;
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
 		

From f894ec1c8f66a7c6affc30f0cea9a8964fb9d0e2 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 17:21:07 +0100
Subject: [PATCH 074/150] pass host address and port to resolver.resolve() directly, not by pointer

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 09ace40..d6805df 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -217,7 +217,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
             tcp::resolver resolver(io_context);
             //tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
             //tcp::resolver::iterator iter = resolver.resolve(query);
-			boost::asio::connect(*peers[host_node_].tcp_socket, resolver.resolve(&host_address_, &host_port_));
+			boost::asio::connect(*peers[host_node_].tcp_socket, resolver.resolve(host_address_, host_port_));
 
 			// send requested rank to host
 			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << "from host" << std::endl; )

From da62e5901f8351149f8bdfa7617f1b892e643d1a Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 17:25:01 +0100
Subject: [PATCH 075/150] heap-allocate temp_socks instead of using a stack array

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index d6805df..bf8fe99 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -236,7 +236,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			tcp::acceptor acc(io_context, endpoint);
 
 			node_t req_ranks[nodes_]; // store requested ranks in order of connection
-			tcp::socket temp_socks[nodes_]; // store sockets temporarily in connection order
+			tcp::socket* temp_socks = new socket[node_](io_context); // store sockets temporarily in connection order
 			bool taken_ranks[nodes_] {false};
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 

From 1d8bb99a6971418dc07bc35da94f171af9d403f6 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 17:26:33 +0100
Subject: [PATCH 076/150] qualify socket as tcp::socket in temp_socks allocation

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index bf8fe99..4cbd8c4 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -236,7 +236,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			tcp::acceptor acc(io_context, endpoint);
 
 			node_t req_ranks[nodes_]; // store requested ranks in order of connection
-			tcp::socket* temp_socks = new socket[node_](io_context); // store sockets temporarily in connection order
+			tcp::socket* temp_socks = new tcp::socket[node_](io_context); // store sockets temporarily in connection order
 			bool taken_ranks[nodes_] {false};
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 

From 41b9a7f5d84edba739dc877bca9afc44eef5ee16 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 17:29:30 +0100
Subject: [PATCH 077/150] fix temp_socks array size: node_ -> nodes_

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 4cbd8c4..5de053b 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -236,7 +236,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			tcp::acceptor acc(io_context, endpoint);
 
 			node_t req_ranks[nodes_]; // store requested ranks in order of connection
-			tcp::socket* temp_socks = new tcp::socket[node_](io_context); // store sockets temporarily in connection order
+			tcp::socket* temp_socks = new tcp::socket[nodes_](io_context); // store sockets temporarily in connection order
 			bool taken_ranks[nodes_] {false};
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 

From 5b2df19cc783ecb850ffb76749c7187c3228a52d Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 17:39:50 +0100
Subject: [PATCH 078/150] take address of moved temp socket; capture this in io_context thread lambda

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 5de053b..cb3745a 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -262,7 +262,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 				} else {
 					node_t rrank = req_ranks[j];
 					HAM_DEBUG( HAM_LOG << "communicator::communicator(): connection " << j << " requested ham-address: " << rrank << std::endl; )
-					peers[rrank].tcp_socket = std::move(temp_socks[j]); // = move https://www.boost.org/doc/libs/1_65_0/doc/html/boost_asio/reference/basic_stream_socket/operator_eq_.html
+					peers[rrank].tcp_socket = &std::move(temp_socks[j]); // = move https://www.boost.org/doc/libs/1_65_0/doc/html/boost_asio/reference/basic_stream_socket/operator_eq_.html
 					taken_ranks[rrank] = true; // mark the requested rank as taken
 					HAM_DEBUG( HAM_LOG << "communicator::communicator(): associated ham-address: " << rrank << " with connection " << j << std::endl; )
 					// send assigned rank to target
@@ -302,7 +302,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
 			boost::asio::io_service::work work(io_context);
-			std::thread thread([&io_context](){ io_context.run(); });
+			std::thread thread([this](){ io_context.run(); });
 		}
 
 

From c708644c635f8b893e24c50c5ba5c644b35db515 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 17:47:14 +0100
Subject: [PATCH 079/150] allocate target socket with new; dereference tcp_socket in rank writes

---
 include/ham/net/communicator_tcp.hpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index cb3745a..98d8bc0 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -212,8 +212,8 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 		// targets init tcp connection to host
 		if(!is_host()) {
-            tcp::socket sock(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
-            peers[host_node_].tcp_socket = &sock;
+            tcp::socket sock = new tcp::socket(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
+            peers[host_node_].tcp_socket = sock;
             tcp::resolver resolver(io_context);
             //tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
             //tcp::resolver::iterator iter = resolver.resolve(query);
@@ -266,7 +266,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 					taken_ranks[rrank] = true; // mark the requested rank as taken
 					HAM_DEBUG( HAM_LOG << "communicator::communicator(): associated ham-address: " << rrank << " with connection " << j << std::endl; )
 					// send assigned rank to target
-					boost::asio::write(peers[rrank].tcp_socket, boost::asio::buffer((void*)&rrank, sizeof(rrank)));
+					boost::asio::write(*peers[rrank].tcp_socket, boost::asio::buffer((void*)&rrank, sizeof(rrank)));
 				}
 			}
 
@@ -279,7 +279,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 							HAM_DEBUG( HAM_LOG << "communicator::communicator(): associating wildcard connection: " << k << " with ham-address " << i << std::endl; )
 							peers[i].tcp_socket = temp_socks[k];
 							taken_ranks[i] = true;
-							boost::asio::write(peers[i].tcp_socket, boost::asio::buffer((void*)&i, sizeof(i)));
+							boost::asio::write(*peers[i].tcp_socket, boost::asio::buffer((void*)&i, sizeof(i)));
 							break; // stop if free rank is assigned, go back to k-loop for next wildcard connection
 						}
 					}
@@ -484,7 +484,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 		detail::resource_pool<size_t> buffer_pool;
 
 		// tcp socket
-		tcp::socket* tcp_socket;
+		tcp::socket tcp_socket;
 	};
 	
 	tcp_peer* peers;

From c423ad096936376e14d3d4553383e1094f3cb8a5 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:09:35 +0100
Subject: [PATCH 080/150] make temp_socks an array of socket pointers; tcp_socket is a pointer again

---
 include/ham/net/communicator_tcp.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 98d8bc0..a7d7b8f 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -236,12 +236,12 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			tcp::acceptor acc(io_context, endpoint);
 
 			node_t req_ranks[nodes_]; // store requested ranks in order of connection
-			tcp::socket* temp_socks = new tcp::socket[nodes_](io_context); // store sockets temporarily in connection order
+			tcp::socket* temp_socks[nodes_]; // store sockets temporarily in connection order
 			bool taken_ranks[nodes_] {false};
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 
 			for(int i=1; i < nodes_; i++) {
-				temp_socks[i] = acc.accept(); // accept connection
+				 acc.accept(temp_socks[i]); // accept connection
 
 				// recv rank
 				boost::asio::read(temp_socks[i], boost::asio::buffer((void *) &req_ranks[i], sizeof(node_t)));
@@ -484,7 +484,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 		detail::resource_pool<size_t> buffer_pool;
 
 		// tcp socket
-		tcp::socket tcp_socket;
+		tcp::socket* tcp_socket;
 	};
 	
 	tcp_peer* peers;

From e2e35df574badc9313279b3c25f6ecabb8b9041d Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:14:30 +0100
Subject: [PATCH 081/150] declare sock as tcp::socket*; dereference temp_socks[i] in accept()

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index a7d7b8f..479e33e 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -212,7 +212,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 		// targets init tcp connection to host
 		if(!is_host()) {
-            tcp::socket sock = new tcp::socket(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
+            tcp::socket* sock = new tcp::socket(io_context); // socket is always stored with index = target node, so no "if_host" switching is necessary for functions executed on host and target
             peers[host_node_].tcp_socket = sock;
             tcp::resolver resolver(io_context);
             //tcp::resolver::query query(tcp::v4(), host_address_, host_port_);
@@ -241,7 +241,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 
 			for(int i=1; i < nodes_; i++) {
-				 acc.accept(temp_socks[i]); // accept connection
+				 acc.accept(*temp_socks[i]); // accept connection
 
 				// recv rank
 				boost::asio::read(temp_socks[i], boost::asio::buffer((void *) &req_ranks[i], sizeof(node_t)));

From 50eb7cc650539fabc63f81456c85fe2e7be93131 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:17:06 +0100
Subject: [PATCH 082/150] dereference temp_socks[i] in read(); assign socket pointer to peer directly

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 479e33e..1d07be3 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -244,7 +244,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 				 acc.accept(*temp_socks[i]); // accept connection
 
 				// recv rank
-				boost::asio::read(temp_socks[i], boost::asio::buffer((void *) &req_ranks[i], sizeof(node_t)));
+				boost::asio::read(*temp_socks[i], boost::asio::buffer((void *) &req_ranks[i], sizeof(node_t)));
 			}
 
 			// rearrange sockets and inform targets of resulting rank
@@ -262,7 +262,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 				} else {
 					node_t rrank = req_ranks[j];
 					HAM_DEBUG( HAM_LOG << "communicator::communicator(): connection " << j << " requested ham-address: " << rrank << std::endl; )
-					peers[rrank].tcp_socket = &std::move(temp_socks[j]); // = move https://www.boost.org/doc/libs/1_65_0/doc/html/boost_asio/reference/basic_stream_socket/operator_eq_.html
+					peers[rrank].tcp_socket = temp_socks[j]; // = move https://www.boost.org/doc/libs/1_65_0/doc/html/boost_asio/reference/basic_stream_socket/operator_eq_.html
 					taken_ranks[rrank] = true; // mark the requested rank as taken
 					HAM_DEBUG( HAM_LOG << "communicator::communicator(): associated ham-address: " << rrank << " with connection " << j << std::endl; )
 					// send assigned rank to target

From 1828a78d86a303ce47a3e962e908602c23d863fb Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:20:30 +0100
Subject: [PATCH 083/150] dereference tcp_socket pointer in read/write and async_read/async_write calls

---
 include/ham/net/communicator_tcp.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 1d07be3..1bc5d1d 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -221,9 +221,9 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			// send requested rank to host
 			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << "from host" << std::endl; )
-			boost::asio::write(peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
+			boost::asio::write(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
 			// recv rank from host
-			boost::asio::read(peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
+			boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
 			HAM_DEBUG( HAM_LOG << "communicator::communicator(): received ham-address " << this_node_ << "from host" << std::endl; )
 		}
 
@@ -352,7 +352,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 		// tcp write
 		auto self(shared_from_this());
-		boost::asio::async_write(peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
+		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
 								[this, self, &req](boost::system::error_code ec, size_t length) {
 									req.sent_ = true;
 								}
@@ -366,7 +366,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	{
 		static msg_buffer buffer; // NOTE !
 		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-		boost::asio::read(peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, size));
+		boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, size));
         return static_cast<void*>(&buffer);
 	}
 
@@ -375,7 +375,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
 
-        boost::asio::write(peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
+        boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
     }
 
 	// trigger receiving the result of an active message on the host
@@ -383,7 +383,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	{
 		// tcp receive
         auto self(shared_from_this());
-        boost::asio::async_read(peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
+        boost::asio::async_read(*peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
 				[this, self, &req](boost::system::error_code ec, size_t length) {
 					req.received_ = true;
 				}
@@ -397,7 +397,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	{
 		// tcp send
 
-        boost::asio::write(peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size * sizeof(T)));
+        boost::asio::write(*peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size * sizeof(T)));
 		// MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
 	}
 
@@ -406,7 +406,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		auto self(shared_from_this());
-		boost::asio::async_write(peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size*sizeof(T)),
+		boost::asio::async_write(*peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size*sizeof(T)),
 								 [this, self, &req](boost::system::error_code ec, size_t length) {
 									 req.sent_ = true;
 								 }

From df30d5b12a37c92a9bfb687228842c6a80d524b7 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:23:38 +0100
Subject: [PATCH 084/150] include mpi.h in offload_msg.hpp only for HAM_COMM_MPI_RMA_DYNAMIC

---
 include/ham/offload/offload_msg.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index 6e709a4..01f4e9d 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -6,7 +6,9 @@
 #ifndef ham_offload_offload_msg_hpp
 #define ham_offload_offload_msg_hpp
 
+#ifdef HAM_COMM_MPI_RMA_DYNAMIC
 #include <mpi.h>
+#endif
 #include "ham/msg/active_msg.hpp"
 #include "ham/msg/execution_policy.hpp"
 #include "ham/misc/constants.hpp"

From 2b778edc8913e566458132ec5c452650296e26ae Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:27:53 +0100
Subject: [PATCH 085/150] fixed spelling mistake

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 1bc5d1d..4934e9d 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -419,7 +419,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
 		// tcp recv
-        boost::asio::read(peers[remote_source.node()].tpc_socket, boost::asio::buffer((void*)local_dest, size * sizeof(T)));
+        boost::asio::read(peers[remote_source.node()].tcp_socket, boost::asio::buffer((void*)local_dest, size * sizeof(T)));
 		// MPI_Recv((void*)local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 	}
 	

From be3c8ccfeae6aeda02f769cb4faa1592e0385244 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:29:49 +0100
Subject: [PATCH 086/150] fixed spelling mistake

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 4934e9d..772578f 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -419,7 +419,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
 		// tcp recv
-        boost::asio::read(peers[remote_source.node()].tcp_socket, boost::asio::buffer((void*)local_dest, size * sizeof(T)));
+        boost::asio::read(*peers[remote_source.node()].tcp_socket, boost::asio::buffer((void*)local_dest, size * sizeof(T)));
 		// MPI_Recv((void*)local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), constants::DATA_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
 	}
 	
@@ -428,7 +428,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
         auto self(shared_from_this());
-		boost::asio::async_read(peers[remote_source.node()].tpc_socket, boost::asio::buffer(static_cast<void*>(local_dest), size*sizeof(T)),
+		boost::asio::async_read(*peers[remote_source.node()].tcp_socket, boost::asio::buffer(static_cast<void*>(local_dest), size*sizeof(T)),
 								[this, self, &req](boost::system::error_code ec, size_t length) {
 									req.received_ = true;
 								}

From a9408d37fd2f21341b4e106d551733102e866003 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:51:25 +0100
Subject: [PATCH 087/150] trying something dumb to prevent already open error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/ham/net/communicator_tcp.hpp | 1 +
 src/ham/CMakeLists.txt               | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 772578f..bda7c0e 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -241,6 +241,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 
 			for(int i=1; i < nodes_; i++) {
+                 temp_socks->close();
 				 acc.accept(*temp_socks[i]); // accept connection
 
 				// recv rank
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
index cc0df10..1652e1c 100644
--- a/src/ham/CMakeLists.txt
+++ b/src/ham/CMakeLists.txt
@@ -29,7 +29,7 @@ add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
 		offload/main.cpp
 		util/cpu_affinity.cpp)
 target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
-target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library)
+target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library pthread)
 
 set_target_properties(ham_offload_tcp PROPERTIES
 		CXX_STANDARD 11

From 866de30fdcc899ab9395c24cea05f12451b38861 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:54:47 +0100
Subject: [PATCH 088/150] trying something dumb to prevent already open
 error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index bda7c0e..1dffd27 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -241,7 +241,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 
 			for(int i=1; i < nodes_; i++) {
-                 temp_socks->close();
+                 temp_socks[i]->close();
 				 acc.accept(*temp_socks[i]); // accept connection
 
 				// recv rank

From 3c00f53c971e910f5a56dbbfcb89c9fc6d47bcdc Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 18:58:17 +0100
Subject: [PATCH 089/150] trying something less dumb to prevent already open
 error

---
 include/ham/net/communicator_tcp.hpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 1dffd27..436aba0 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -237,11 +237,15 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			node_t req_ranks[nodes_]; // store requested ranks in order of connection
 			tcp::socket* temp_socks[nodes_]; // store sockets temporarily in connection order
+            for (int l = 1; l < nodes_; ++l) {
+                temp_socks[l] = new tcp::socket(io_context);
+            }
+
 			bool taken_ranks[nodes_] {false};
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 
 			for(int i=1; i < nodes_; i++) {
-                 temp_socks[i]->close();
+                 // temp_socks[i]->close();
 				 acc.accept(*temp_socks[i]); // accept connection
 
 				// recv rank

From c4cefff27ef4e073e3be7ac058b47b4714dbf7fa Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 19:03:49 +0100
Subject: [PATCH 090/150] fixed check for illegal rank request

---
 include/ham/net/communicator_tcp.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 436aba0..ac136ab 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -220,7 +220,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			boost::asio::connect(*peers[host_node_].tcp_socket, resolver.resolve(host_address_, host_port_));
 
 			// send requested rank to host
-			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << "from host" << std::endl; )
+			HAM_DEBUG( HAM_LOG << "communicator::communicator(): requesting ham-address " << this_node_ << " from host" << std::endl; )
 			boost::asio::write(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
 			// recv rank from host
 			boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
@@ -254,8 +254,8 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			// rearrange sockets and inform targets of resulting rank
 			for (int j = 1; j < nodes_; ++j) {
-				if(req_ranks[j] < -1 || req_ranks[j] > nodes_-1) { // check if rank invalid
-					std::cout << "communicator::communicator(): illegal ham-address requested:" << req_ranks[j] << std::endl;
+				if((req_ranks[j] < -1) || (req_ranks[j] > nodes_-1)) { // check if rank invalid
+					std::cout << "communicator::communicator(): illegal ham-address requested: " << req_ranks[j] << std::endl;
 					exit(-1);
 				}else if(req_ranks[j] == -1) { // skip wildcard ranks, handled later to avoid conflicting ranks with following connects
 					HAM_DEBUG( HAM_LOG << "communicator::communicator(): connection " << j << " requested wildcard ham-address" << std::endl; )

From 63736e06d207809ceb3f0b6ee293ca72d1e5e6e9 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 19:12:20 +0100
Subject: [PATCH 091/150] fixed check for illegal rank request

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index ac136ab..a6b012f 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -254,7 +254,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			// rearrange sockets and inform targets of resulting rank
 			for (int j = 1; j < nodes_; ++j) {
-				if((req_ranks[j] < -1) || (req_ranks[j] > nodes_-1)) { // check if rank invalid
+				if((req_ranks[j] < -1) || (req_ranks[j] > (nodes_-1))) { // check if rank invalid
 					std::cout << "communicator::communicator(): illegal ham-address requested: " << req_ranks[j] << std::endl;
 					exit(-1);
 				}else if(req_ranks[j] == -1) { // skip wildcard ranks, handled later to avoid conflicting ranks with following connects

From 4a9cf736ecfd5cd4f55cf1bacef6c13a25c5bcf9 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 19:18:05 +0100
Subject: [PATCH 092/150] fixed check for illegal rank request

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index a6b012f..892ff22 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -254,7 +254,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			// rearrange sockets and inform targets of resulting rank
 			for (int j = 1; j < nodes_; ++j) {
-				if((req_ranks[j] < -1) || (req_ranks[j] > (nodes_-1))) { // check if rank invalid
+				if((req_ranks[j] > (nodes_-1))) { // check if rank invalid
 					std::cout << "communicator::communicator(): illegal ham-address requested: " << req_ranks[j] << std::endl;
 					exit(-1);
 				}else if(req_ranks[j] == -1) { // skip wildcard ranks, handled later to avoid conflicting ranks with following connects

From 4afa1568cfb5eea462517312943064b3b582d586 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 19:28:06 +0100
Subject: [PATCH 093/150] fixed check for illegal rank request (kinda)

---
 include/ham/net/communicator_tcp.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 892ff22..e99568a 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -224,7 +224,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			boost::asio::write(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
 			// recv rank from host
 			boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)&this_node_, sizeof(this_node_)));
-			HAM_DEBUG( HAM_LOG << "communicator::communicator(): received ham-address " << this_node_ << "from host" << std::endl; )
+			HAM_DEBUG( HAM_LOG << "communicator::communicator(): received ham-address " << this_node_ << " from host" << std::endl; )
 		}
 
 		// host accepts tcp connection from targets
@@ -254,7 +254,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 
 			// rearrange sockets and inform targets of resulting rank
 			for (int j = 1; j < nodes_; ++j) {
-				if((req_ranks[j] > (nodes_-1))) { // check if rank invalid
+				if((req_ranks[j] > (nodes_-1))) { // check if rank invalid // TODO: fix -1 wildcard, currently not possible because req_ranks is unsigned node_t=size_t
 					std::cout << "communicator::communicator(): illegal ham-address requested: " << req_ranks[j] << std::endl;
 					exit(-1);
 				}else if(req_ranks[j] == -1) { // skip wildcard ranks, handled later to avoid conflicting ranks with following connects
@@ -305,9 +305,13 @@ class communicator : public std::enable_shared_from_this<communicator> {
 				}
 			}
 
+            HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
+
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
 			boost::asio::io_service::work work(io_context);
 			std::thread thread([this](){ io_context.run(); });
+
+            HAM_DEBUG( HAM_LOG << "communicator::communicator(): async thread started" << std::endl; )
 		}
 
 

From abd32cf979ed0f3f570223fe0c7e32e804b7b8b8 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 31 Oct 2018 19:50:05 +0100
Subject: [PATCH 094/150] fixed async thread terminating early (wip)

---
 include/ham/net/communicator_tcp.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index e99568a..528978f 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -310,6 +310,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
 			boost::asio::io_service::work work(io_context);
 			std::thread thread([this](){ io_context.run(); });
+            thread.detach();
 
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): async thread started" << std::endl; )
 		}

From d053cf78c40c40fe032e6cf55da5d81d15124c01 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 17:26:42 +0100
Subject: [PATCH 095/150] removed shared pointers

---
 include/ham/net/communicator_tcp.hpp | 28 ++++++++++++++++------------
 src/benchmark_ham_offload.cpp        |  6 +++++-
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 528978f..6588319 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -10,6 +10,7 @@
 #include <cstring> // memcpy
 #include <stdlib.h> // posix_memalign
 #include <thread> // async thread
+// #include <memory> // std::shared_ptr
 
 #include <boost/asio.hpp>
 #include <boost/program_options.hpp>
@@ -60,7 +61,7 @@ class node_descriptor
 	friend class net::communicator;
 };
 
-class communicator : public std::enable_shared_from_this<communicator> {
+class communicator { // : public std::enable_shared_from_this<communicator>
 public:
 	// externally used interface of request must be shared across all communicator-implementations
 	class request {
@@ -361,9 +362,9 @@ class communicator : public std::enable_shared_from_this<communicator> {
 		memcpy(msg_buffer, msg, size);
 
 		// tcp write
-		auto self(shared_from_this());
+		// auto self(shared_from_this());
 		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
-								[this, self, &req](boost::system::error_code ec, size_t length) {
+								[&req](boost::system::error_code ec, size_t length) {
 									req.sent_ = true;
 								}
 		);
@@ -380,6 +381,7 @@ class communicator : public std::enable_shared_from_this<communicator> {
         return static_cast<void*>(&buffer);
 	}
 
+    // target only -> sync
     // send result through communicator
     // only to be used by request.send_result()
     template<class T>
@@ -388,13 +390,14 @@ class communicator : public std::enable_shared_from_this<communicator> {
         boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
     }
 
+    // host only -> async
 	// trigger receiving the result of an active message on the host
 	void recv_result(request_reference_type req)
 	{
 		// tcp receive
-        auto self(shared_from_this());
+        // auto self(shared_from_this());
         boost::asio::async_read(*peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
-				[this, self, &req](boost::system::error_code ec, size_t length) {
+				[&req](boost::system::error_code ec, size_t length) {
 					req.received_ = true;
 				}
 		);
@@ -402,29 +405,30 @@ class communicator : public std::enable_shared_from_this<communicator> {
 		return;
 	}
 
+    // target only, host never uses sync variant
 	template<typename T>
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
 		// tcp send
-
         boost::asio::write(*peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size * sizeof(T)));
+
 		// MPI_Send((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD);
 	}
 
-	// to be used by the host
+	// host only
 	template<typename T>
 	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
-		auto self(shared_from_this());
+		// auto self(shared_from_this());
 		boost::asio::async_write(*peers[remote_dest.node()].tcp_socket, boost::asio::buffer((void*)local_source, size*sizeof(T)),
-								 [this, self, &req](boost::system::error_code ec, size_t length) {
+								 [&req](boost::system::error_code ec, size_t length) {
 									 req.sent_ = true;
 								 }
 		);
 		// MPI_Isend((void*)local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), constants::DATA_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
 	}
 
-
+    // target only
 	template<typename T>
 	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
@@ -437,9 +441,9 @@ class communicator : public std::enable_shared_from_this<communicator> {
 	template<typename T>
 	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-        auto self(shared_from_this());
+        // auto self(shared_from_this());
 		boost::asio::async_read(*peers[remote_source.node()].tcp_socket, boost::asio::buffer(static_cast<void*>(local_dest), size*sizeof(T)),
-								[this, self, &req](boost::system::error_code ec, size_t length) {
+								[&req](boost::system::error_code ec, size_t length) {
 									req.received_ = true;
 								}
 		);
diff --git a/src/benchmark_ham_offload.cpp b/src/benchmark_ham_offload.cpp
index 3e55ec7..3b56e3a 100644
--- a/src/benchmark_ham_offload.cpp
+++ b/src/benchmark_ham_offload.cpp
@@ -165,7 +165,11 @@ int main(int argc, char * argv[])
 	#else
 		std::cout << "# COMM_MPI_RMA_DYNAMIC         disabled" << std::endl;
 	#endif
-
+	#ifdef HAM_COMM_TCP
+		std::cout << "# COMM_TCP         enabled" << std::endl;
+	#else
+		std::cout << "# COMM_TCP         disabled" << std::endl;
+#endif
 #ifdef HAM_COMM_SCIF
 		std::cout << "# HAM_COMM_SCIF                enabled" << std::endl;
 	#ifdef HAM_SCIF_RMA_CPU

From 2721d16e83495ce7fd587da7da6c0b6333fef320 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 17:36:37 +0100
Subject: [PATCH 096/150] changed std::thread invocation

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 6588319..4e2377d 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -310,7 +310,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
 			boost::asio::io_service::work work(io_context);
-			std::thread thread([this](){ io_context.run(); });
+			std::thread thread([this, &io_context](){ io_context.run(); });
             thread.detach();
 
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): async thread started" << std::endl; )

From 2581f11d0077e6a2f10ae48f35162731419d89d4 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 17:55:47 +0100
Subject: [PATCH 097/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 4e2377d..4548f9c 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -310,7 +310,12 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
 			boost::asio::io_service::work work(io_context);
-			std::thread thread([this, &io_context](){ io_context.run(); });
+			std::thread thread([this](){
+                HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
+                io_context.run();
+                HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
+                }
+            );
             thread.detach();
 
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): async thread started" << std::endl; )

From 4d88a32032bc947f522da71e9d3b3d4ef9f4cc3e Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 17:59:57 +0100
Subject: [PATCH 098/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 4548f9c..cf109a7 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -309,9 +309,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-			boost::asio::io_service::work work(io_context);
+
 			std::thread thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
+                boost::asio::io_service::work work(io_context);
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }

From 08b774c31f5af321ae502560ce1f562b62a2d563 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:07:25 +0100
Subject: [PATCH 099/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index cf109a7..8099735 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -370,8 +370,9 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		// tcp write
 		// auto self(shared_from_this());
 		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
-								[&req](boost::system::error_code ec, size_t length) {
+								[this, &req](boost::system::error_code ec, size_t length) {
 									req.sent_ = true;
+                                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed" << std::endl; )
 								}
 		);
 		// MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
@@ -403,7 +404,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		// tcp receive
         // auto self(shared_from_this());
         boost::asio::async_read(*peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
-				[&req](boost::system::error_code ec, size_t length) {
+				[this, &req](boost::system::error_code ec, size_t length) {
 					req.received_ = true;
 				}
 		);

From fd02c36483a1fbe2ced17d138543ad2242ae51df Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:10:59 +0100
Subject: [PATCH 100/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 8099735..9b25336 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -369,10 +369,12 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
 		// tcp write
 		// auto self(shared_from_this());
+        HAM_DEBUG( HAM_LOG << "communicator::send_msg(): sending msg to: " << req.target_node << std::endl; )
+
 		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
 								[this, &req](boost::system::error_code ec, size_t length) {
 									req.sent_ = true;
-                                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed" << std::endl; )
+                                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() completed" << std::endl; )
 								}
 		);
 		// MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());

From 95095eab73bfe2e1371aa7512b98999b356780cd Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:13:45 +0100
Subject: [PATCH 101/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 9b25336..bb79915 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -386,6 +386,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	{
 		static msg_buffer buffer; // NOTE !
 		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        HAM_DEBUG( HAM_LOG << "communicator::recv_msg_host(): node " << this_node_ << " awaiting AM from host"  << std::endl; )
 		boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, size));
         return static_cast<void*>(&buffer);
 	}

From cd4f43148fd0d0ec4b2d405de305249edafca233 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:26:20 +0100
Subject: [PATCH 102/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index bb79915..d553695 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -406,9 +406,12 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	{
 		// tcp receive
         // auto self(shared_from_this());
+        HAM_DEBUG( HAM_LOG << "communicator::recv_result(): receiving msg from: " << req.target_node << std::endl; )
+
         boost::asio::async_read(*peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
 				[this, &req](boost::system::error_code ec, size_t length) {
 					req.received_ = true;
+                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, recv_result() completed " << req.target_node << std::endl; )
 				}
 		);
 		// MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());

From 3996db0975f1e1f0e2927a121b57fcceae724104 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:34:34 +0100
Subject: [PATCH 103/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index d553695..5fc749f 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -374,7 +374,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
 								[this, &req](boost::system::error_code ec, size_t length) {
 									req.sent_ = true;
-                                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() completed" << std::endl; )
+                                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() to " << req.target_node << " completed" << std::endl; )
 								}
 		);
 		// MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());

From 354e1d2b712457889532f7c65f1e95f5fec11eed Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:41:37 +0100
Subject: [PATCH 104/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 5fc749f..ff53282 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -373,8 +373,13 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
 		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
 								[this, &req](boost::system::error_code ec, size_t length) {
-									req.sent_ = true;
-                                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() to " << req.target_node << " completed" << std::endl; )
+                                    if (!ec)
+                                    {
+                                        req.sent_ = true;
+                                        HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() to " << req.target_node << " completed" << std::endl; )
+                                    } else {
+                                        HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, failed to send_msg() to " << req.target_node << " Error: " << ec.message() << std::endl; )
+                                    }
 								}
 		);
 		// MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());

From aa18deb421de331110302364471e8227a85e6ff8 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:44:38 +0100
Subject: [PATCH 105/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index ff53282..b0536cb 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -376,7 +376,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
                                     if (!ec)
                                     {
                                         req.sent_ = true;
-                                        HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() to " << req.target_node << " completed" << std::endl; )
+                                        HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, send_msg() to " << req.target_node << " completed. Wrote " << length << " Bytes." << std::endl; )
                                     } else {
                                         HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, failed to send_msg() to " << req.target_node << " Error: " << ec.message() << std::endl; )
                                     }

From 7244c92e7de8e0dd69550f64016d2b0a640d5269 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 18:53:33 +0100
Subject: [PATCH 106/150] debugging async ops not completing

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index b0536cb..1171fbd 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -392,7 +392,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		static msg_buffer buffer; // NOTE !
 		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg_host(): node " << this_node_ << " awaiting AM from host"  << std::endl; )
-		boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, size));
+		boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, 72 /*size*/));
         return static_cast<void*>(&buffer);
 	}
 

From 922e9daf4ddecdf5a5d965941cdc5fd8294aa76d Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:34:34 +0100
Subject: [PATCH 107/150] adding delimiter to AM transfers

---
 include/ham/misc/constants.hpp       | 1 +
 include/ham/net/communicator_tcp.hpp | 8 +++++---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/ham/misc/constants.hpp b/include/ham/misc/constants.hpp
index b37c690..fba5c4c 100644
--- a/include/ham/misc/constants.hpp
+++ b/include/ham/misc/constants.hpp
@@ -19,6 +19,7 @@ enum net {
 	MSG_BUFFERS = 256,
 	DATA_PUT_CODE = 1,
 	DATA_GET_CODE = 2,
+	TCP_DELIM = "\r\n\r\n",
 };
 
 enum arch {
diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 1171fbd..aabd741 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -142,7 +142,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	typedef request& request_reference_type;
 	typedef const request& request_const_reference_type;
 
-	communicator(int argc, char* argv[]) : node_desc_dummy()
+	communicator(int argc, char* argv[]) : node_desc_dummy(), delim(constants::TCP_DELIM)
 	{
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising configuration" << std::endl; )
 
@@ -366,12 +366,13 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		// copy message from caller into transfer buffer
 		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
 		memcpy(msg_buffer, msg, size);
+        memcpy(msg_buffer+size, delim.c_str(), delim.size()); // add tcp delimiter to message is defined in
 
 		// tcp write
 		// auto self(shared_from_this());
         HAM_DEBUG( HAM_LOG << "communicator::send_msg(): sending msg to: " << req.target_node << std::endl; )
 
-		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size),
+		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size+delim.size()),
 								[this, &req](boost::system::error_code ec, size_t length) {
                                     if (!ec)
                                     {
@@ -392,7 +393,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		static msg_buffer buffer; // NOTE !
 		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg_host(): node " << this_node_ << " awaiting AM from host"  << std::endl; )
-		boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, 72 /*size*/));
+		boost::asio::read_until(*peers[host_node_].tcp_socket, &buffer, delim);
         return static_cast<void*>(&buffer);
 	}
 
@@ -507,6 +508,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	std::string host_port_;
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
+    const std::string delim;
 		
 	struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender

From 2878033412ff77f2320122025c21cbc8737a14c0 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:38:30 +0100
Subject: [PATCH 108/150] adding delimiter to AM transfers

---
 include/ham/misc/constants.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/ham/misc/constants.hpp b/include/ham/misc/constants.hpp
index fba5c4c..9f9fa70 100644
--- a/include/ham/misc/constants.hpp
+++ b/include/ham/misc/constants.hpp
@@ -10,6 +10,9 @@
 #ifndef HAM_MESSAGE_SIZE
 #define HAM_MESSAGE_SIZE 4096
 #endif
+#ifndef HAM_TCP_DELIM
+#define HAM_TCP_DELIM "\r\n\r\n"
+#endif
 
 namespace ham {
 namespace constants {
@@ -19,7 +22,6 @@ enum net {
 	MSG_BUFFERS = 256,
 	DATA_PUT_CODE = 1,
 	DATA_GET_CODE = 2,
-	TCP_DELIM = "\r\n\r\n",
 };
 
 enum arch {

From fa33ca55bbd78419c6b0fc2ff9e6c20cdc8aa99e Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:39:08 +0100
Subject: [PATCH 109/150] adding delimiter to AM transfers

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index aabd741..4e24b49 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -142,7 +142,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	typedef request& request_reference_type;
 	typedef const request& request_const_reference_type;
 
-	communicator(int argc, char* argv[]) : node_desc_dummy(), delim(constants::TCP_DELIM)
+	communicator(int argc, char* argv[]) : node_desc_dummy(), delim(HAM_TCP_DELIM)
 	{
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising configuration" << std::endl; )
 

From d520e2cf4fc1717d1af658a8b15ebfcff8c7c465 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:41:44 +0100
Subject: [PATCH 110/150] adding delimiter to AM transfers

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 4e24b49..1bba878 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -393,7 +393,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		static msg_buffer buffer; // NOTE !
 		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg_host(): node " << this_node_ << " awaiting AM from host"  << std::endl; )
-		boost::asio::read_until(*peers[host_node_].tcp_socket, &buffer, delim);
+		boost::asio::read_until(*peers[host_node_].tcp_socket, buffer, delim);
         return static_cast<void*>(&buffer);
 	}
 

From 5fdba9fdc0392d01af0e8c86a888cb2fca32babd Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:48:28 +0100
Subject: [PATCH 111/150] removed delimiters, just send full MSG_SIZE

---
 include/ham/misc/constants.hpp       |  3 ---
 include/ham/net/communicator_tcp.hpp | 12 +++++-------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/include/ham/misc/constants.hpp b/include/ham/misc/constants.hpp
index 9f9fa70..b37c690 100644
--- a/include/ham/misc/constants.hpp
+++ b/include/ham/misc/constants.hpp
@@ -10,9 +10,6 @@
 #ifndef HAM_MESSAGE_SIZE
 #define HAM_MESSAGE_SIZE 4096
 #endif
-#ifndef HAM_TCP_DELIM
-#define HAM_TCP_DELIM "\r\n\r\n"
-#endif
 
 namespace ham {
 namespace constants {
diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 1bba878..87baee7 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -142,7 +142,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	typedef request& request_reference_type;
 	typedef const request& request_const_reference_type;
 
-	communicator(int argc, char* argv[]) : node_desc_dummy(), delim(HAM_TCP_DELIM)
+	communicator(int argc, char* argv[]) : node_desc_dummy()
 	{
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): initialising configuration" << std::endl; )
 
@@ -366,13 +366,12 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		// copy message from caller into transfer buffer
 		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
 		memcpy(msg_buffer, msg, size);
-        memcpy(msg_buffer+size, delim.c_str(), delim.size()); // add tcp delimiter to message is defined in
 
 		// tcp write
 		// auto self(shared_from_this());
         HAM_DEBUG( HAM_LOG << "communicator::send_msg(): sending msg to: " << req.target_node << std::endl; )
-
-		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, size+delim.size()),
+        //always write full message size TODO(improvement): improve with delimiter and read_until @ target
+		boost::asio::async_write(*peers[req.target_node].tcp_socket, boost::asio::buffer(msg_buffer, constants::MSG_SIZE),
 								[this, &req](boost::system::error_code ec, size_t length) {
                                     if (!ec)
                                     {
@@ -393,7 +392,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		static msg_buffer buffer; // NOTE !
 		// MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
         HAM_DEBUG( HAM_LOG << "communicator::recv_msg_host(): node " << this_node_ << " awaiting AM from host"  << std::endl; )
-		boost::asio::read_until(*peers[host_node_].tcp_socket, buffer, delim);
+		boost::asio::read(*peers[host_node_].tcp_socket, boost::asio::buffer(&buffer, size)); // will always read full MSG_SIZE
         return static_cast<void*>(&buffer);
 	}
 
@@ -508,8 +507,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	std::string host_port_;
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
-    const std::string delim;
-		
+
 	struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
 

From 77173ff6ed7cbff818bbcbdf835075218d96f2e2 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:52:08 +0100
Subject: [PATCH 112/150] added error handling on recv_result

---
 include/ham/net/communicator_tcp.hpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 87baee7..9f5b4d6 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -415,8 +415,13 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
         boost::asio::async_read(*peers[req.target_node].tcp_socket, boost::asio::buffer(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE),
 				[this, &req](boost::system::error_code ec, size_t length) {
-					req.received_ = true;
-                    HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, recv_result() completed " << req.target_node << std::endl; )
+                    if (!ec)
+                    {
+                        req.received_ = true;
+                        HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, recv_result() completed " << req.target_node << std::endl; )
+                    } else {
+                        HAM_DEBUG( HAM_LOG << "THREAD: Async completion handler executed, failed to recv_result() from " << req.target_node << " Error: " << ec.message() << std::endl; )
+                    }
 				}
 		);
 		// MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());

From 86e8bb7d7ab888cfecbe015713dfb1829e27f1ef Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 19:57:45 +0100
Subject: [PATCH 113/150] added error handling on recv_result

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 9f5b4d6..4a1ccf6 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -401,7 +401,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     // only to be used by request.send_result()
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
-
+        HAM_DEBUG( HAM_LOG << "communicator::send_result(): sending result to host"  << std::endl; )
         boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
     }
 

From d16d33629eaa8a5da38c3f33fcc6bed78eaafde3 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 20:01:29 +0100
Subject: [PATCH 114/150] added debug output for send_result()

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 4a1ccf6..bfc4d88 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -401,7 +401,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     // only to be used by request.send_result()
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
-        HAM_DEBUG( HAM_LOG << "communicator::send_result(): sending result to host"  << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::send_result(): node " << target_node << " sending result to host"  << std::endl; )
         boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
     }
 

From 268e817e7228f45e6a73e12fef0e8a86d9c8446e Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 20:03:05 +0100
Subject: [PATCH 115/150] fixed send_result() target node

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index bfc4d88..482e3b7 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -402,7 +402,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
         HAM_DEBUG( HAM_LOG << "communicator::send_result(): node " << target_node << " sending result to host"  << std::endl; )
-        boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
+        boost::asio::write(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)message, size));
     }
 
     // host only -> async

From 617252e061b263f9f7e31ef48c4ef30ef20880e6 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 20:04:11 +0100
Subject: [PATCH 116/150] fixed send_result() target node - properly

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 482e3b7..bfd1cd7 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -104,7 +104,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
 			// MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
 
-			communicator::instance().send_result(target_node, result_msg, size);
+			communicator::instance().send_result(source_node, result_msg, size);
             // don't need size * sizeof(T) because req.send_result is called as send_result((void*)&a, sizeof(a)) in offload_msg.hpp
 		}
 
@@ -402,7 +402,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
         HAM_DEBUG( HAM_LOG << "communicator::send_result(): node " << target_node << " sending result to host"  << std::endl; )
-        boost::asio::write(*peers[host_node_].tcp_socket, boost::asio::buffer((void*)message, size));
+        boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
     }
 
     // host only -> async

From 9b935a631e48bffcccc32c93c633a76e8c3751b6 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 20:19:43 +0100
Subject: [PATCH 117/150] sending full MSG_SIZE for results

---
 include/ham/net/communicator_tcp.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index bfd1cd7..de11114 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -104,6 +104,8 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
 			// MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
 
+
+
 			communicator::instance().send_result(source_node, result_msg, size);
             // don't need size * sizeof(T) because req.send_result is called as send_result((void*)&a, sizeof(a)) in offload_msg.hpp
 		}
@@ -402,7 +404,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
         HAM_DEBUG( HAM_LOG << "communicator::send_result(): node " << target_node << " sending result to host"  << std::endl; )
-        boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer((void*)message, size));
+        void* ptr; // ugly stuff to wrap result into MSG_SIZE buffer TODO(improvement): change to transfering only actual result size by using delimiter and read_until in recv_result()
+        posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, constants::MSG_SIZE);
+        memcpy(ptr, message, size);
+        boost::asio::write(*peers[target_node].tcp_socket, boost::asio::buffer(ptr, constants::MSG_SIZE));
     }
 
     // host only -> async

From 126699ae6361225159efaa940d87ec1000b6160a Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 20:51:58 +0100
Subject: [PATCH 118/150] fixed tcp to use same copy protocol as MPI

---
 include/ham/offload/offload.hpp | 2 +-
 src/benchmark_ham_offload.cpp   | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 58e7e19..334ec45 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -339,7 +339,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 // fix 1st arg:
 //	comm.send_data(src_node, local_source, remote_dest, n);
 //	static_assert(false, "copy is not implemented yet for the SCIF back-end");
-#elif defined HAM_COMM_MPI
+#elif defined(HAM_COMM_MPI) || defined(HAM_COMM_TCP)
 	// send corresponding write and read messages to the sender and the receiver
 
 	// issues a send operation on the source node, that sends the memory at source to the destination node
diff --git a/src/benchmark_ham_offload.cpp b/src/benchmark_ham_offload.cpp
index 3b56e3a..dabe62d 100644
--- a/src/benchmark_ham_offload.cpp
+++ b/src/benchmark_ham_offload.cpp
@@ -166,9 +166,9 @@ int main(int argc, char * argv[])
 		std::cout << "# COMM_MPI_RMA_DYNAMIC         disabled" << std::endl;
 	#endif
 	#ifdef HAM_COMM_TCP
-		std::cout << "# COMM_TCP         enabled" << std::endl;
+		std::cout << "# COMM_TCP                     enabled" << std::endl;
 	#else
-		std::cout << "# COMM_TCP         disabled" << std::endl;
+		std::cout << "# COMM_TCP                     disabled" << std::endl;
 #endif
 #ifdef HAM_COMM_SCIF
 		std::cout << "# HAM_COMM_SCIF                enabled" << std::endl;

From ff4fd42ee40ba2b511001d7668ab9fc628e851d9 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 2 Nov 2018 21:04:02 +0100
Subject: [PATCH 119/150] unfixed tcp copy to not implemented... as originally
 intended

---
 include/ham/offload/offload.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 334ec45..7632a1b 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -329,17 +329,17 @@ future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 }
 #endif
 
-#ifndef HAM_COMM_ONE_SIDED // TODO(feature, high priority): implement
+#if !defined(HAM_COMM_ONE_SIDED) || !defined(HAM_COMM_TCP)// TODO(feature, high priority): implement
         template<typename T>
 void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
 	net::communicator& comm = runtime::instance().communicator();
-#ifdef HAM_COMM_ONE_SIDED
+#if  defined(HAM_COMM_ONE_SIDED) || defined(HAM_COMM_TCP)
 // TODO(feature, high priority): implement
 // fix 1st arg:
 //	comm.send_data(src_node, local_source, remote_dest, n);
 //	static_assert(false, "copy is not implemented yet for the SCIF back-end");
-#elif defined(HAM_COMM_MPI) || defined(HAM_COMM_TCP)
+#elif defined HAM_COMM_MPI
 	// send corresponding write and read messages to the sender and the receiver
 
 	// issues a send operation on the source node, that sends the memory at source to the destination node

From 462bbbf7666bec93e83cef86be2aea774d02a12b Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 21:01:00 +0100
Subject: [PATCH 120/150] made sent/received completion flags volatile

---
 CMakeLists.txt                       | 2 +-
 include/ham/net/communicator_tcp.hpp | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c48af8..30f3dbd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ else ()
 endif ()
 
 # tell the compiler to be strict
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -hstd=c++11")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DHAM_DEBUG_ON")
 
 add_subdirectory(thirdparty/bmt ${CMAKE_CURRENT_BINARY_DIR}/build.noma_bmt)
diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index de11114..33332fd 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -126,8 +126,8 @@ class communicator { // : public std::enable_shared_from_this<communicator>
         node_t target_node;
 		node_t source_node;
 		bool valid_;
-		bool received_; // used for the async receive handler to set to true, checked for completion
-		bool sent_; // used for the async send handler to set to true... unused, but the handler likes to do something
+		volatile bool received_; // used for the async receive handler to set to true, checked for completion
+		volatile bool sent_; // used for the async send handler to set to true... unused, but the handler likes to do something
 
 		// only needed by the sender
 		enum { NUM_REQUESTS = 3 };
@@ -403,7 +403,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     // only to be used by request.send_result()
     template<class T>
     void send_result(node_t target_node, T* message, size_t size) {
-        HAM_DEBUG( HAM_LOG << "communicator::send_result(): node " << target_node << " sending result to host"  << std::endl; )
+        HAM_DEBUG( HAM_LOG << "communicator::send_result(): node " << this_node_ << " sending result to node: " << target_node  << std::endl; )
         void* ptr; // ugly stuff to wrap result into MSG_SIZE buffer TODO(improvement): change to transfering only actual result size by using delimiter and read_until in recv_result()
         posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, constants::MSG_SIZE);
         memcpy(ptr, message, size);

From b81187bbcae6990ebefce3c83124bf3f8a3e5a6a Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 21:39:20 +0100
Subject: [PATCH 121/150] proper asio connection teardown

---
 include/ham/net/communicator_tcp.hpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 33332fd..3e43df2 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -311,15 +311,14 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-
-			std::thread thread([this](){
+            work = boost::asio::make_work_guard(io_context);
+			thread = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
-                boost::asio::io_service::work work(io_context);
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }
             );
-            thread.detach();
+            // thread.detach(); no longer needed with member thread
 
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): async thread started" << std::endl; )
 		}
@@ -332,8 +331,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	{
 		// finalize
 		if(is_host()) {
-			io_context.stop();
+			work.reset();
+            thread.join();
 		}
+        io_context.stop();
 		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
 	}
 
@@ -517,7 +518,9 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	std::string host_port_;
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
-
+    std::thread_ thread;
+    //boost::asio::io_service::work work; //1.65 syntax
+    boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work;
 	struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
 

From 09307f75c0d8e456e92659df7cb4ab7e68b1caa6 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 21:41:16 +0100
Subject: [PATCH 122/150] proper asio connection teardown

---
 include/ham/net/communicator_tcp.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 3e43df2..7049661 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -311,8 +311,8 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-            work = boost::asio::make_work_guard(io_context);
-			thread = std::thread([this](){
+            work_ = boost::asio::make_work_guard(io_context);
+			thread_ = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
@@ -520,7 +520,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	boost::asio::io_service io_context;
     std::thread_ thread;
     //boost::asio::io_service::work work; //1.65 syntax
-    boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work;
+    boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_;
 	struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
 

From 2da089a6291fe0019c8f6ae82a2949aa005b847d Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 21:42:33 +0100
Subject: [PATCH 123/150] proper asio connection teardown

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 7049661..09885ab 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -318,7 +318,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }
             );
-            // thread.detach(); no longer needed with member thread
+            // thread_.detach(); no longer needed with member thread
 
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): async thread started" << std::endl; )
 		}
@@ -332,7 +332,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		// finalize
 		if(is_host()) {
 			work.reset();
-            thread.join();
+            thread_.join();
 		}
         io_context.stop();
 		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )

From e56004e8d0c5449275a19ededd92f208bb17a780 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 21:43:50 +0100
Subject: [PATCH 124/150] proper asio connection teardown

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 09885ab..431b5be 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -518,7 +518,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	std::string host_port_;
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
-    std::thread_ thread;
+    std::thread thread_;
     //boost::asio::io_service::work work; //1.65 syntax
     boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_;
 	struct tcp_peer {

From fa9120ae7bffa07fa9a96aa74f14500789d7036f Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 21:44:40 +0100
Subject: [PATCH 125/150] proper asio connection teardown

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 431b5be..5e8b51f 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -331,7 +331,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	{
 		// finalize
 		if(is_host()) {
-			work.reset();
+			work_.reset();
             thread_.join();
 		}
         io_context.stop();

From 1671a121eb8628813a5ab4bdbe3a93c9e836a997 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 22:18:00 +0100
Subject: [PATCH 126/150] changed work guard

---
 include/ham/net/communicator_tcp.hpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 5e8b51f..aaf3a8b 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -311,7 +311,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-            work_ = boost::asio::make_work_guard(io_context);
+            work_ = boost::asio::io_service::work(io_context);
 			thread_ = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
                 io_context.run();
@@ -519,9 +519,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
     std::thread thread_;
-    //boost::asio::io_service::work work; //1.65 syntax
-    boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_;
-	struct tcp_peer {
+    boost::asio::io_service::work work_; //1.65 syntax
+    //boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_; // 1.66 syntax
+
+    struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
 
 		// needed by sender to manage which buffers are in use and which are free

From 1adea8c9db6f3665b75df64a1ed09576ea8c54ab Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 22:31:10 +0100
Subject: [PATCH 127/150] changed work guard

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index aaf3a8b..017c5f0 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -311,9 +311,9 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-            work_ = boost::asio::io_service::work(io_context);
 			thread_ = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
+                work_ = boost::asio::io_service::work(io_context);
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }

From 5431c8e91b9b3faaa83069984be23c4537ff1fbe Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 22:35:31 +0100
Subject: [PATCH 128/150] changed work guard

---
 include/ham/net/communicator_tcp.hpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 017c5f0..daad480 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -313,7 +313,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
 			thread_ = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
-                work_ = boost::asio::io_service::work(io_context);
+                boost::asio::io_service::work work(io_context);
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }
@@ -330,12 +330,11 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 	~communicator()
 	{
 		// finalize
-		if(is_host()) {
-			work_.reset();
-            thread_.join();
-		}
         io_context.stop();
-		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
+        if(is_host()) {
+            thread_.join();
+        }
+        HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
 	}
 
 
@@ -519,8 +518,6 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
     std::thread thread_;
-    boost::asio::io_service::work work_; //1.65 syntax
-    //boost::asio::executor_work_guard<boost::asio::io_context::executor_type> work_; // 1.66 syntax
 
     struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender

From 9b9d5d4ed669a0062351e03a2bb24b8ab03e4647 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 22:55:51 +0100
Subject: [PATCH 129/150] workaround to prevent tcp target from crashing due to
 connection closing before terminate functor is transmitted and executed

---
 include/ham/offload/offload.hpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 7632a1b..9a66c9d 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -197,6 +197,9 @@ void ping(node_t node, Functor&& func)
 	HAM_DEBUG( HAM_LOG << "runtime::ping(): sending msg..." << std::endl; )
 	net::communicator::request req = comm.allocate_request(node); // TODO(improvement): resource deallocation of this request (currently only used for terminating)
 	comm.send_msg(req, (void*)&msg, sizeof msg);
+#if defined(HAM_COMM_TCP)
+    while(!req.sent())		// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
+#fi
 	HAM_DEBUG( HAM_LOG << "runtime::ping(): sending msg done." << std::endl; )
 }
 

From 940c8b7f68ecee6a6d24242d7b0e14ca6c1a443c Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 22:56:54 +0100
Subject: [PATCH 130/150] workaround to prevent tcp target from crashing due to
 connection closing before terminate functor is transmitted and executed

---
 include/ham/offload/offload.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 9a66c9d..43a8c98 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -199,7 +199,7 @@ void ping(node_t node, Functor&& func)
 	comm.send_msg(req, (void*)&msg, sizeof msg);
 #if defined(HAM_COMM_TCP)
     while(!req.sent())		// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
-#fi
+#endfi
 	HAM_DEBUG( HAM_LOG << "runtime::ping(): sending msg done." << std::endl; )
 }
 

From 82dc863d97fb7748c54111adf4d6364689e68c74 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 22:57:27 +0100
Subject: [PATCH 131/150] workaround to prevent tcp target from crashing due to
 connection closing before terminate functor is transmitted and executed

---
 include/ham/offload/offload.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 43a8c98..6a01569 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -199,7 +199,7 @@ void ping(node_t node, Functor&& func)
 	comm.send_msg(req, (void*)&msg, sizeof msg);
 #if defined(HAM_COMM_TCP)
     while(!req.sent())		// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
-#endfi
+#endif
 	HAM_DEBUG( HAM_LOG << "runtime::ping(): sending msg done." << std::endl; )
 }
 

From 5c0ba9af595c249322572733e47ead69a65d73e4 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 23:25:58 +0100
Subject: [PATCH 132/150] workaround to prevent tcp target from crashing due to
 connection closing before terminate functor is transmitted and executed

---
 include/ham/offload/offload.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 6a01569..00db9f6 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -198,7 +198,7 @@ void ping(node_t node, Functor&& func)
 	net::communicator::request req = comm.allocate_request(node); // TODO(improvement): resource deallocation of this request (currently only used for terminating)
 	comm.send_msg(req, (void*)&msg, sizeof msg);
 #if defined(HAM_COMM_TCP)
-    while(!req.sent())		// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
+    while(!req.sent())	{}	// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
 #endif
 	HAM_DEBUG( HAM_LOG << "runtime::ping(): sending msg done." << std::endl; )
 }

From 589fdd09fae220955c338d891473ecb5a63a7dfe Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 6 Nov 2018 23:45:08 +0100
Subject: [PATCH 133/150] workaround to prevent tcp target from crashing due to
 connection closing before terminate functor is transmitted and executed

---
 include/ham/net/communicator_tcp.hpp | 4 ++++
 include/ham/offload/offload.hpp      | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index daad480..2a98be9 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -123,6 +123,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 			return sent_;
 		}
 
+        void wait_sent() const {
+            while(!sent_);
+        }
+
         node_t target_node;
 		node_t source_node;
 		bool valid_;
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 00db9f6..221693c 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -198,7 +198,7 @@ void ping(node_t node, Functor&& func)
 	net::communicator::request req = comm.allocate_request(node); // TODO(improvement): resource deallocation of this request (currently only used for terminating)
 	comm.send_msg(req, (void*)&msg, sizeof msg);
 #if defined(HAM_COMM_TCP)
-    while(!req.sent())	{}	// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
+    req.wait_sent();	// ugly workaround to prevent target from crashing because of connection teardown before the terminate functor is executed
 #endif
 	HAM_DEBUG( HAM_LOG << "runtime::ping(): sending msg done." << std::endl; )
 }

From 476857d9b5ab6a0ce230c6b7ea9a7e778dfd7de8 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:01:50 +0100
Subject: [PATCH 134/150] testing work guard

---
 include/ham/net/communicator_tcp.hpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 2a98be9..b7a15ed 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -124,7 +124,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 		}
 
         void wait_sent() const {
-            while(!sent_);
+            while(!sent_) {};
         }
 
         node_t target_node;
@@ -315,9 +315,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-			thread_ = std::thread([this](){
+            boost::asio::io_service::work work(io_context);
+            thread_ = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
-                boost::asio::io_service::work work(io_context);
+                // TODO(bug fix): need to figure out how to reset work from main thread so the background thread can return from run() before the host killst the io_context
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }
@@ -333,7 +334,12 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
 	~communicator()
 	{
-		// finalize
+		// TODO(bug fix): what we actually want:
+        // stop the work guard, so the thread will return from io_context.run() when all outstanding ops completed
+        // join the thread so the host waits until above is done
+        // stop the context
+        // currently: have to kill the context first because otherwise the thread wont complete to be joined
+        // but this causes thread to abandon any outstanding ops
         io_context.stop();
         if(is_host()) {
             thread_.join();

From 2939029536713d81182f9343796a76faac66a0a0 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:03:26 +0100
Subject: [PATCH 135/150] testing work guard

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index b7a15ed..1591948 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -316,7 +316,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
             boost::asio::io_service::work work(io_context);
-            thread_ = std::thread([this](){
+            thread_ = std::thread([this, &work](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
                 // TODO(bug fix): need to figure out how to reset work from main thread so the background thread can return from run() before the host killst the io_context
                 io_context.run();

From 442bc34a96880c304901518631be564caebbbdbb Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:06:05 +0100
Subject: [PATCH 136/150] testing work guard

---
 include/ham/net/communicator_tcp.hpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 1591948..2d666a2 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -315,10 +315,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-            boost::asio::io_service::work work(io_context);
             thread_ = std::thread([this, &work](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
                 // TODO(bug fix): need to figure out how to reset work from main thread so the background thread can return from run() before the host killst the io_context
+                work_ = boost::asio::io_service::work(io_context);
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }
@@ -528,6 +528,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
     std::thread thread_;
+    boost::asio::io_service::work work_;
 
     struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender

From dec53c8869cf6d447bae83279572be82a0e62a13 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:18:32 +0100
Subject: [PATCH 137/150] testing work guard

---
 include/ham/net/communicator_tcp.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 2d666a2..b99be8a 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -315,10 +315,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             HAM_DEBUG( HAM_LOG << "communicator::communicator(): initializing buffers done" << std::endl; )
 
 			// host runs io_context in separate thread (asynchronous progress thread) for async operations
-            thread_ = std::thread([this, &work](){
+            thread_ = std::thread([this](){
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Heyooo, I live." << std::endl; )
                 // TODO(bug fix): need to figure out how to reset work from main thread so the background thread can return from run() before the host killst the io_context
-                work_ = boost::asio::io_service::work(io_context);
+                boost::asio::io_service::work work(io_context);
                 io_context.run();
                 HAM_DEBUG( HAM_LOG << "ASYNC THREAD: Oh noes, I'm dead!" << std::endl; )
                 }

From 9b8d7ca9acabaaec484761bcd0d12f1daee67038 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:20:00 +0100
Subject: [PATCH 138/150] testing work guard

---
 include/ham/net/communicator_tcp.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index b99be8a..ce7d4ba 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -528,7 +528,6 @@ class communicator { // : public std::enable_shared_from_this<communicator>
     node_descriptor node_desc_dummy;
 	boost::asio::io_service io_context;
     std::thread thread_;
-    boost::asio::io_service::work work_;
 
     struct tcp_peer {
 		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender

From 426f10ab4db10832baa26c3c79f72614429a5d95 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:33:27 +0100
Subject: [PATCH 139/150] implemented copy

---
 include/ham/offload/offload.hpp | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 221693c..26d7d6a 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -332,12 +332,12 @@ future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 }
 #endif
 
-#if !defined(HAM_COMM_ONE_SIDED) || !defined(HAM_COMM_TCP)// TODO(feature, high priority): implement
+#if !defined(HAM_COMM_ONE_SIDED)// TODO(feature, high priority): implement
         template<typename T>
 void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
 	net::communicator& comm = runtime::instance().communicator();
-#if  defined(HAM_COMM_ONE_SIDED) || defined(HAM_COMM_TCP)
+#if  defined(HAM_COMM_ONE_SIDED)
 // TODO(feature, high priority): implement
 // fix 1st arg:
 //	comm.send_data(src_node, local_source, remote_dest, n);
@@ -364,6 +364,11 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
     // use async copy + sync
     copy(source, dest, n).get();
+#elif defined HAM_COMM_TCP
+	void* ptr;
+	posix_memalign(&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+	get_sync(source, ptr, n*sizeof(T));
+	put_sync(ptr, dest,n*sizeof(T));
 #endif
 }
 

From c75e7d9a60af6170530c60c4c9ee60665a740e3f Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Wed, 7 Nov 2018 00:36:30 +0100
Subject: [PATCH 140/150] implemented copy

---
 include/ham/offload/offload.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index 26d7d6a..dafb6da 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -365,10 +365,10 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
     // use async copy + sync
     copy(source, dest, n).get();
 #elif defined HAM_COMM_TCP
-	void* ptr;
-	posix_memalign(&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
-	get_sync(source, ptr, n*sizeof(T));
-	put_sync(ptr, dest,n*sizeof(T));
+	T* ptr;
+	posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+	get_sync(source, ptr, n);
+	put_sync(ptr, dest,n);
 #endif
 }
 

From d3599d86ce6982a2134833f72372b59e3cf9482c Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Thu, 8 Nov 2018 15:37:16 +0100
Subject: [PATCH 141/150] change dynamic array init for compatibility with
 clang

---
 include/ham/net/communicator_tcp.hpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index ce7d4ba..5e53710 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -248,7 +248,10 @@ class communicator { // : public std::enable_shared_from_this<communicator>
                 temp_socks[l] = new tcp::socket(io_context);
             }
 
-			bool taken_ranks[nodes_] {false};
+			bool taken_ranks[nodes_];
+            for (int x = 0; x < nodex_; ++x) {
+                taken_ranks[x]= false;
+            }
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)
 
 			for(int i=1; i < nodes_; i++) {

From 67f92365d5286f65644aa50758ed254604b16606 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Thu, 8 Nov 2018 15:38:15 +0100
Subject: [PATCH 142/150] change dynamic array init for compatibility with
 clang

---
 include/ham/net/communicator_tcp.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/ham/net/communicator_tcp.hpp b/include/ham/net/communicator_tcp.hpp
index 5e53710..888dfb7 100644
--- a/include/ham/net/communicator_tcp.hpp
+++ b/include/ham/net/communicator_tcp.hpp
@@ -249,7 +249,7 @@ class communicator { // : public std::enable_shared_from_this<communicator>
             }
 
 			bool taken_ranks[nodes_];
-            for (int x = 0; x < nodex_; ++x) {
+            for (int x = 0; x < nodes_; ++x) {
                 taken_ranks[x]= false;
             }
 			taken_ranks[0] = true; // host rank has to be correctly provided and is therefore already taken (by the executing process)

From e60f0dbe6c4b091c4369e62e3eaf7802791c8f15 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 9 Nov 2018 18:48:35 +0100
Subject: [PATCH 143/150] added streams

---
 CMakeLists.txt                 |   2 +-
 include/ham/misc/types.hpp     |   1 +
 include/ham/offload/stream.hpp | 157 +++++++++++++++++++++++++++++++++
 src/CMakeLists.txt             |   3 +
 src/ham/CMakeLists.txt         |   2 +
 src/test_streams.cpp           |  94 ++++++++++++++++++++
 6 files changed, 258 insertions(+), 1 deletion(-)
 create mode 100644 include/ham/offload/stream.hpp
 create mode 100644 src/test_streams.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 30f3dbd..5c48af8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ else ()
 endif ()
 
 # tell the compiler to be strict
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -hstd=c++11")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DHAM_DEBUG_ON")
 
 add_subdirectory(thirdparty/bmt ${CMAKE_CURRENT_BINARY_DIR}/build.noma_bmt)
diff --git a/include/ham/misc/types.hpp b/include/ham/misc/types.hpp
index 1b8393d..cf4d7e7 100644
--- a/include/ham/misc/types.hpp
+++ b/include/ham/misc/types.hpp
@@ -13,6 +13,7 @@ namespace ham {
 
 typedef size_t node_t; // node type, e.g. MPI rank, identifies remote target process
 typedef size_t flag_t; // MPI RMA completion flag / buffer index
+typedef char byte_t;
 typedef char*  msg_buffer_t; // buffer type for messages
 
 namespace net {
diff --git a/include/ham/offload/stream.hpp b/include/ham/offload/stream.hpp
new file mode 100644
index 0000000..2a0f477
--- /dev/null
+++ b/include/ham/offload/stream.hpp
@@ -0,0 +1,157 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+// something for "requires ham_offload_hpp"
+
+#ifndef ham_offload_stream_hpp
+#define ham_offload_stream_hpp
+
+#include "ham/net/communicator.hpp"
+
+#include <sstream>
+#include <string>
+
+#include "ham/misc/types.hpp"
+#include "ham/functor/buffer.hpp"
+#include "ham/offload/offload_msg.hpp"
+#include "ham/offload/offload.hpp"
+#include "ham/offload/runtime.hpp"
+#include "ham/util/at_end_of_scope_do.hpp"
+#include "ham/util/debug.hpp"
+#include "ham/util/log.hpp"
+
+
+namespace ham {
+namespace offload {
+namespace stream {
+
+using ::ham::net::buffer_ptr;
+using ::ham::node_t;
+using ::ham::byte_t;
+
+class ostream;
+
+class stream_base {
+public:
+	stream_base(node_t target) : target_(target) {}
+
+	stream_base(node_t target, buffer_ptr<byte_t> buffer, size_t size) : target_(target), buffer_(buffer),
+	                                                                     size_(size) {}
+	// put common stuff of ostream/istream here
+
+	buffer_ptr<byte_t> buffer() { return buffer_; }
+
+	void buffer(buffer_ptr<byte_t> buffer) { buffer_ = buffer; }
+
+	size_t size() { return size_; }
+
+	void size(size_t size) { size_ = size; }
+
+	node_t target() { return target_; } // no setting intended
+protected:
+	node_t target_;
+	buffer_ptr<byte_t> buffer_; // remote sink, remote memory
+	size_t size_; // size of remote sink
+};
+
+class stream_proxy {
+
+	friend class istream;
+
+public:
+	stream_proxy(); // default contstuctor needed for return transport dummy entries
+	stream_proxy(stream_base *stream) : target_(stream->target()), buffer_(stream->buffer()),
+	                                    size_(stream->size()) {}
+
+private:
+	node_t target_;
+	buffer_ptr<byte_t> buffer_;
+	size_t size_;
+};
+
+class ostream : public stream_base, public std::ostringstream {
+
+public:
+	// always need the node associated with this stream
+	ostream(node_t target) : stream_base(target), std::ostringstream() {}
+
+	ostream(node_t target, size_t size) : stream_base(target), std::ostringstream(), fixed_(true) {
+		posix_memalign((void **) &fixed_ptr_, constants::CACHE_LINE_SIZE, size);
+		rdbuf()->pubsetbuf(fixed_ptr_, size);
+		// NOTE: this does NOT set the streams buffer or size. It will only associate a buffer that should be large enough to not need resizing (user's responsibility)
+		// if it should not be large enough, it may still be resized/reallocated
+	}
+
+	~ostream() {
+		if (fixed_) std::free((void *) fixed_ptr_);
+	}
+
+	const stream_proxy sync() {
+		std::string temp = rdbuf()->str(); // COPY ... no other option, direct pointers not accessible
+		if (ham::offload::is_host()) { // on host
+			buffer_ = offload::allocate<byte_t>(target_, temp.size());
+			size_ = temp.size();
+			offload::put_sync((byte_t *) temp.c_str(), buffer_, size_);
+			return stream_proxy(this);
+		} else { // on target
+			ham::net::communicator &comm = ham::offload::runtime::instance().communicator();
+			buffer_ = comm.allocate_buffer<byte_t>((size_t) temp.size(), ham::offload::this_node());
+			size_ = temp.size();
+			strcpy((char *) buffer_.get(),
+			       temp.c_str()); // COPY ... no other option, depending on backend we need the mem to be allocated by new_buffer
+			return stream_proxy(this);
+		}
+	}
+
+	// we reduce the dynamic here
+	/*
+	- use like a local in-memory stream, i.e. stringstream, maybe inherit stringstream, or output version
+		- ss.str().data() and size()
+		- on explicit synchronisation request from user
+			- allocate remote memory, set internal butter_ptr with known size
+			- put() data onto target
+	*/
+private:
+	bool fixed_ = false;
+	byte_t *fixed_ptr_ = nullptr;
+};
+
+
+class istream : public stream_base, public std::istringstream {
+public:
+	istream(const stream_proxy proxy) : stream_base(proxy.target_, proxy.buffer_, proxy.size_),
+	                                    std::istringstream() {
+		if (ham::offload::is_host()) {
+			posix_memalign((void **) &local_ptr_, constants::CACHE_LINE_SIZE, size_);
+			offload::get_sync(buffer_, local_ptr_, size_);
+			this->rdbuf()->pubsetbuf(local_ptr_, size_);
+		} else {
+			rdbuf()->pubsetbuf(buffer_.get(),
+			                   size_); // avoid a copy that would be necessary when using str(string) to set the content
+		}
+	}
+	// fail on underflow, set flags/state whatever, check std::istream interface
+
+	// maybe use stringstream and reconstruct from data_
+
+	~istream() {
+		if (ham::offload::is_host()) {
+			offload::free(buffer_);
+			std::free((void *) local_ptr_);
+		} else {
+			ham::net::communicator &comm = ham::offload::runtime::instance().communicator();
+			comm.free_buffer(buffer_); // this is where we trash "used" buffers on the targets
+		}
+	}
+
+private:
+	byte_t *local_ptr_ = nullptr;
+};
+
+
+} // namespace stream
+}
+} // namespace ham
+#endif // ham_offload_stream_hpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8dbb21b..e32675b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -71,6 +71,9 @@ if (MPI_FOUND)
 	add_executable(test_argument_transfer_mpi test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi ham_offload_mpi)
 
+	add_executable(test_streams_mpi test_streams.cpp)
+	target_link_libraries(test_streams_mpi ham_offload_mpi)
+
 # RMA MPI
 
 	add_executable(ham_offload_test_mpi_rma_dynamic ham_offload.cpp)
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
index 1652e1c..30ae5cd 100644
--- a/src/ham/CMakeLists.txt
+++ b/src/ham/CMakeLists.txt
@@ -18,6 +18,7 @@ set(HAM_LIB_SRC
 	net/communicator_mpi_rma_dynamic.cpp
 	offload/runtime.cpp
 	offload/offload.cpp
+	offload/stream.cpp
 	util/cpu_affinity.cpp)
 
 # TCP
@@ -26,6 +27,7 @@ add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
 		net/communicator_tcp.cpp
 		offload/runtime.cpp
 		offload/offload.cpp
+		offload/stream.cpp
 		offload/main.cpp
 		util/cpu_affinity.cpp)
 target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
diff --git a/src/test_streams.cpp b/src/test_streams.cpp
new file mode 100644
index 0000000..a5bfb5d
--- /dev/null
+++ b/src/test_streams.cpp
@@ -0,0 +1,94 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/offload.hpp"
+#include "ham/offload/stream.hpp"
+
+#include "cereal/archives/binary.hpp"
+
+#include <array>
+#include <iostream>
+
+struct MyData {
+
+	char one[1024];
+	char two[1024];
+
+	template<class Archive>
+	void serialize(Archive & archive)
+	{
+		archive( one, two );
+	}
+};
+
+// alternative: nicer, with proxy
+// target
+ham::offload::stream::stream_proxy offloaded_fun(ham::offload::stream::stream_proxy osp)
+{
+	ham::offload::stream::istream his(osp); // NOTE: data is already on the target
+
+    MyData m1, m2, m3;
+    {
+        cereal::BinaryInputArchive iarchive(his); // Create an input archive
+
+		iarchive(m1, m2, m3); // Read the data from the archive
+	}
+
+	char* bla = "0123456789";
+	strcpy(m1.one, bla);
+	strcpy(m1.two, bla);
+	char* blub = "ABCDEFGHI";
+	strcpy(m2.one, blub);
+	strcpy(m2.two, blub);
+	strcpy(m2.one, bla);
+	strcpy(m2.two, blub);
+
+
+	ham::offload::stream::ostream hos(0);
+
+    {
+        cereal::BinaryOutputArchive oarchive(hos);
+        oarchive(m1, m2, m3);
+    }
+
+	auto out_proxy = hos.sync();
+
+    return out_proxy;
+}
+
+int main(int argc, char* argv[])
+{
+	ham::offload::node_t target = 1;
+
+	ham::offload::stream::ostream hos(target);
+
+	MyData m1, m2, m3; // could be out of scope, data to be transferred
+
+	{
+		cereal::BinaryOutputArchive oarchive(hos); // Create an output archive
+		oarchive(m1, m2, m3); // Write the data to the archive
+	} // archive goes out of scope, ensuring all contents are flushed
+	// after this scope, data from oarchive is flushed into the stream, stream can be used
+
+	auto out_proxy = hos.sync(); // trigger transfer to target (write has other meaning with streams)
+
+	auto in_proxy = ham::offload::sync(target, f2f(&offloaded_fun, out_proxy));
+
+    ham::offload::stream::istream his(in_proxy);
+
+	{
+		cereal::BinaryInputArchive iarchive(his);
+		iarchive(m1, m2, m3);
+	}
+
+	printf("%.10s\n", m1.one);
+	printf("%.10s\n", m1.two);
+	printf("%.10s\n", m2.one);
+	printf("%.10s\n", m2.two);
+	printf("%.10s\n", m3.one);
+	printf("%.10s\n", m3.two);
+	return 0;	
+}
+

From f6c9f73cb4e3b586a7a97255959a284548cc9a3e Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Fri, 9 Nov 2018 18:48:52 +0100
Subject: [PATCH 144/150] added streams

---
 src/ham/offload/stream.cpp | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 src/ham/offload/stream.cpp

diff --git a/src/ham/offload/stream.cpp b/src/ham/offload/stream.cpp
new file mode 100644
index 0000000..f33c7cf
--- /dev/null
+++ b/src/ham/offload/stream.cpp
@@ -0,0 +1,6 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/offload/stream.hpp"

From c4bdabb38ab63e2a9b2babfd6f4111ad3be15f08 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sat, 10 Nov 2018 16:31:46 +0100
Subject: [PATCH 145/150] changed test to output result too

---
 include/ham/offload/stream.hpp | 32 +++++++++++++++++++++++---------
 src/CMakeLists.txt             | 24 ++++++++++++------------
 src/ham/CMakeLists.txt         | 30 +++++++++++++++---------------
 src/test_argument_transfer.cpp |  2 +-
 4 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/include/ham/offload/stream.hpp b/include/ham/offload/stream.hpp
index 2a0f477..ace80f2 100644
--- a/include/ham/offload/stream.hpp
+++ b/include/ham/offload/stream.hpp
@@ -61,9 +61,11 @@ class stream_proxy {
 	friend class istream;
 
 public:
-	stream_proxy(); // default contstuctor needed for return transport dummy entries
+	stream_proxy() = default; // default contstuctor needed for return transport dummy entries
 	stream_proxy(stream_base *stream) : target_(stream->target()), buffer_(stream->buffer()),
-	                                    size_(stream->size()) {}
+	                                    size_(stream->size()) {
+		HAM_DEBUG( HAM_LOG << "stream_proxy::ctor() called" << std::endl; )
+	}
 
 private:
 	node_t target_;
@@ -75,32 +77,40 @@ class ostream : public stream_base, public std::ostringstream {
 
 public:
 	// always need the node associated with this stream
-	ostream(node_t target) : stream_base(target), std::ostringstream() {}
+	ostream(node_t target) : stream_base(target), std::ostringstream() {
+		HAM_DEBUG( HAM_LOG << "ostream::ctor() called" << std::endl; )
+	}
 
 	ostream(node_t target, size_t size) : stream_base(target), std::ostringstream(), fixed_(true) {
 		posix_memalign((void **) &fixed_ptr_, constants::CACHE_LINE_SIZE, size);
 		rdbuf()->pubsetbuf(fixed_ptr_, size);
+		HAM_DEBUG( HAM_LOG << "ostream::ctor() for provided buffer size called" << std::endl; )
 		// NOTE: this does NOT set the streams buffer or size. It will only associate a buffer that should be large enough to not need resizing (user's responsibility)
 		// if it should not be large enough, it may still be resized/reallocated
 	}
 
 	~ostream() {
 		if (fixed_) std::free((void *) fixed_ptr_);
+		HAM_DEBUG( HAM_LOG << "ostream::dtor()" << std::endl; )
 	}
 
 	const stream_proxy sync() {
 		std::string temp = rdbuf()->str(); // COPY ... no other option, direct pointers not accessible
 		if (ham::offload::is_host()) { // on host
-			buffer_ = offload::allocate<byte_t>(target_, temp.size());
+			HAM_DEBUG( HAM_LOG << "host executing ostream::sync()" << std::endl; )
 			size_ = temp.size();
+			buffer_ = offload::allocate<byte_t>(target_, size_);
+			HAM_DEBUG( HAM_LOG << "ostream::sync() allocated buffer @" << target_ << std::endl; )
 			offload::put_sync((byte_t *) temp.c_str(), buffer_, size_);
+			HAM_DEBUG( HAM_LOG << "ostream::sync() sent data to " << target_ << std::endl; )
 			return stream_proxy(this);
 		} else { // on target
+			HAM_DEBUG( HAM_LOG << "target executing ostream::sync()" << std::endl; )
 			ham::net::communicator &comm = ham::offload::runtime::instance().communicator();
-			buffer_ = comm.allocate_buffer<byte_t>((size_t) temp.size(), ham::offload::this_node());
 			size_ = temp.size();
-			strcpy((char *) buffer_.get(),
-			       temp.c_str()); // COPY ... no other option, depending on backend we need the mem to be allocated by new_buffer
+			buffer_ = comm.allocate_buffer<byte_t>(size_, ham::offload::this_node());
+			HAM_DEBUG( HAM_LOG << "ostream::sync() allocated local buffer" << std::endl; )
+			strcpy((char *) buffer_.get(), temp.c_str()); // COPY ... no other option, depending on backend we need the mem to be allocated by new_buffer
 			return stream_proxy(this);
 		}
 	}
@@ -123,13 +133,15 @@ class istream : public stream_base, public std::istringstream {
 public:
 	istream(const stream_proxy proxy) : stream_base(proxy.target_, proxy.buffer_, proxy.size_),
 	                                    std::istringstream() {
+		HAM_DEBUG( HAM_LOG << "istream::ctor() called with stream_proxy" << target_ << std::endl; )
 		if (ham::offload::is_host()) {
 			posix_memalign((void **) &local_ptr_, constants::CACHE_LINE_SIZE, size_);
 			offload::get_sync(buffer_, local_ptr_, size_);
+			HAM_DEBUG( HAM_LOG << "istream::sync() host retrieved data from " << buffer_.node() << std::endl; )
 			this->rdbuf()->pubsetbuf(local_ptr_, size_);
 		} else {
-			rdbuf()->pubsetbuf(buffer_.get(),
-			                   size_); // avoid a copy that would be necessary when using str(string) to set the content
+			rdbuf()->pubsetbuf(buffer_.get(), size_); // avoid a copy that would be necessary when using str(string) to set the content
+			HAM_DEBUG( HAM_LOG << "istream::sync() target set streambuffer to remote buffer" << target_ << std::endl; )
 		}
 	}
 	// fail on underflow, set flags/state whatever, check std::istream interface
@@ -139,10 +151,12 @@ class istream : public stream_base, public std::istringstream {
 	~istream() {
 		if (ham::offload::is_host()) {
 			offload::free(buffer_);
+			HAM_DEBUG( HAM_LOG << "istream::dtor() freed memory @" << target_ << std::endl; )
 			std::free((void *) local_ptr_);
 		} else {
 			ham::net::communicator &comm = ham::offload::runtime::instance().communicator();
 			comm.free_buffer(buffer_); // this is where we trash "used" buffers on the targets
+			HAM_DEBUG( HAM_LOG << "istream::dtor() freed local memory" << std::endl; )
 		}
 	}
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e32675b..4a2dda3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,8 +13,8 @@ add_subdirectory(ham)
 ## Explicit targets (not built by default)
 
 # TCP benchmarks
-add_executable(benchmark_ham_offload_tcp benchmark_ham_offload.cpp)
-target_link_libraries(benchmark_ham_offload_tcp ham_offload_tcp)
+# add_executable(benchmark_ham_offload_tcp benchmark_ham_offload.cpp)
+# target_link_libraries(benchmark_ham_offload_tcp ham_offload_tcp)
 
 # Intel LEO offload directive benchmark, requires Intel compiler
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
@@ -42,17 +42,17 @@ add_executable(active_msgs active_msgs.cpp)
 target_link_libraries(active_msgs ham_interface)
 
 # TCP tests
-add_executable(ham_offload_test_tcp ham_offload.cpp)
-target_link_libraries(ham_offload_test_tcp ham_offload_tcp)
+# add_executable(ham_offload_test_tcp ham_offload.cpp)
+# target_link_libraries(ham_offload_test_tcp ham_offload_tcp)
 
-add_executable(inner_product_tcp inner_product.cpp)
-target_link_libraries(inner_product_tcp ham_offload_tcp)
+# add_executable(inner_product_tcp inner_product.cpp)
+# target_link_libraries(inner_product_tcp ham_offload_tcp)
 
-add_executable(test_data_transfer_tcp test_data_transfer.cpp)
-target_link_libraries(test_data_transfer_tcp ham_offload_tcp)
+# add_executable(test_data_transfer_tcp test_data_transfer.cpp)
+# target_link_libraries(test_data_transfer_tcp ham_offload_tcp)
 
-add_executable(test_argument_transfer_tcp test_argument_transfer.cpp)
-target_link_libraries(test_argument_transfer_tcp ham_offload_tcp)
+# add_executable(test_argument_transfer_tcp test_argument_transfer.cpp)
+# target_link_libraries(test_argument_transfer_tcp ham_offload_tcp)
 
 if (MPI_FOUND)
 # two-sided MPI
@@ -71,8 +71,8 @@ if (MPI_FOUND)
 	add_executable(test_argument_transfer_mpi test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi ham_offload_mpi)
 
-	add_executable(test_streams_mpi test_streams.cpp)
-	target_link_libraries(test_streams_mpi ham_offload_mpi)
+#	add_executable(test_streams_mpi test_streams.cpp)
+#	target_link_libraries(test_streams_mpi ham_offload_mpi)
 
 # RMA MPI
 
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
index 30ae5cd..33a736c 100644
--- a/src/ham/CMakeLists.txt
+++ b/src/ham/CMakeLists.txt
@@ -18,25 +18,25 @@ set(HAM_LIB_SRC
 	net/communicator_mpi_rma_dynamic.cpp
 	offload/runtime.cpp
 	offload/offload.cpp
-	offload/stream.cpp
+#	offload/stream.cpp
 	util/cpu_affinity.cpp)
 
 # TCP
-add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
-		net/communicator.cpp
-		net/communicator_tcp.cpp
-		offload/runtime.cpp
-		offload/offload.cpp
-		offload/stream.cpp
-		offload/main.cpp
-		util/cpu_affinity.cpp)
-target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
-target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library pthread)
+#add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
+#		net/communicator.cpp
+#		net/communicator_tcp.cpp
+#		offload/runtime.cpp
+#		offload/offload.cpp
+#		offload/stream.cpp
+#		offload/main.cpp
+#		util/cpu_affinity.cpp)
+#target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
+#target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library pthread)
 
-set_target_properties(ham_offload_tcp PROPERTIES
-		CXX_STANDARD 11
-		CXX_STANDARD_REQUIRED YES
-		CXX_EXTENSIONS NO)
+#set_target_properties(ham_offload_tcp PROPERTIES
+#		CXX_STANDARD 11
+#		CXX_STANDARD_REQUIRED YES
+#		CXX_EXTENSIONS NO)
 
 if (MPI_FOUND)
 	add_library(ham_offload_mpi # SHARED if BUILD_SHARED_LIBS = TRUE
diff --git a/src/test_argument_transfer.cpp b/src/test_argument_transfer.cpp
index 97a693e..7712459 100644
--- a/src/test_argument_transfer.cpp
+++ b/src/test_argument_transfer.cpp
@@ -23,7 +23,7 @@ bool test_type_invokation(offload::node_t target, T arg)
 {
 	T result = offload::sync(target, f2f(&type_transfer_function<T>, arg));
 	bool passed = result == arg;
-	std::cout << "Result for type \"" << typeid(T).name() << "\": " << (passed ? "pass" : "fail") << std::endl;
+	std::cout << "Result for type \"" << typeid(T).name() << "\": " << arg << (passed ? " -> pass" : " -> fail") << std::endl;
 	return passed;
 }
 

From 96585b08b5ada7fbe65cf9560869d82d1a0b442e Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Tue, 13 Nov 2018 13:32:25 +0100
Subject: [PATCH 146/150] fix migratable use

---
 include/ham/misc/types.hpp | 4 +++-
 tools/install_boost.sh     | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/ham/misc/types.hpp b/include/ham/misc/types.hpp
index cf4d7e7..f50fa0e 100644
--- a/include/ham/misc/types.hpp
+++ b/include/ham/misc/types.hpp
@@ -9,6 +9,8 @@
 #include <algorithm>
 #include <cstddef>
 
+#include "ham/misc/migratable.hpp"
+
 namespace ham {
 
 typedef size_t node_t; // node type, e.g. MPI rank, identifies remote target process
@@ -31,7 +33,7 @@ class result_container
 	T get() { return T(std::move(res)); }
 
 private:
-	T res;
+	migratable<T> res;
 };
 
 template<>
diff --git a/tools/install_boost.sh b/tools/install_boost.sh
index 2a19297..e422feb 100755
--- a/tools/install_boost.sh
+++ b/tools/install_boost.sh
@@ -40,7 +40,7 @@ BASHRC_FILE=$HOME/dev/null # set to /dev/null to disable, or to any other file t
 
 BOOST_BUILD_OPTIONS="-j8" # concurrent build with up to 8 commands
 BOOST_NAME=boost
-BOOST_VERSION=1_65_1
+BOOST_VERSION=1_66_0
 BOOST_MIC_SUFFIX=mic
 BOOST_ARCHIVE=${BOOST_NAME}_${BOOST_VERSION} # NOTE: without tar.bz2
 

From 657302ee36bac1bfab8bfcd65f0e463f150eb523 Mon Sep 17 00:00:00 2001
From: Phuzzyhead <danieldeppisch@onlinehome.de>
Date: Wed, 14 Nov 2018 02:09:45 +0100
Subject: [PATCH 147/150] fixed streams

---
 CMakeLists.txt                 |  2 +-
 include/ham/offload/stream.hpp | 14 ++++-----
 src/CMakeLists.txt             |  4 +--
 src/ham/CMakeLists.txt         |  2 +-
 src/test_streams.cpp           | 55 ++++++++++++++++++++++++++--------
 5 files changed, 54 insertions(+), 23 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c48af8..30f3dbd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ else ()
 endif ()
 
 # tell the compiler to be strict
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -hstd=c++11")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -DHAM_DEBUG_ON")
 
 add_subdirectory(thirdparty/bmt ${CMAKE_CURRENT_BINARY_DIR}/build.noma_bmt)
diff --git a/include/ham/offload/stream.hpp b/include/ham/offload/stream.hpp
index ace80f2..8225f14 100644
--- a/include/ham/offload/stream.hpp
+++ b/include/ham/offload/stream.hpp
@@ -100,17 +100,17 @@ class ostream : public stream_base, public std::ostringstream {
 			HAM_DEBUG( HAM_LOG << "host executing ostream::sync()" << std::endl; )
 			size_ = temp.size();
 			buffer_ = offload::allocate<byte_t>(target_, size_);
-			HAM_DEBUG( HAM_LOG << "ostream::sync() allocated buffer @" << target_ << std::endl; )
+			HAM_DEBUG( HAM_LOG << "ostream::sync() allocated buffer @" << target_ << " size: " << size_ << std::endl; )
 			offload::put_sync((byte_t *) temp.c_str(), buffer_, size_);
-			HAM_DEBUG( HAM_LOG << "ostream::sync() sent data to " << target_ << std::endl; )
+			HAM_DEBUG( HAM_LOG << "ostream::sync() sent data to " << target_ << " size: " << size_ << std::endl; )
 			return stream_proxy(this);
 		} else { // on target
 			HAM_DEBUG( HAM_LOG << "target executing ostream::sync()" << std::endl; )
 			ham::net::communicator &comm = ham::offload::runtime::instance().communicator();
 			size_ = temp.size();
 			buffer_ = comm.allocate_buffer<byte_t>(size_, ham::offload::this_node());
-			HAM_DEBUG( HAM_LOG << "ostream::sync() allocated local buffer" << std::endl; )
-			strcpy((char *) buffer_.get(), temp.c_str()); // COPY ... no other option, depending on backend we need the mem to be allocated by new_buffer
+			HAM_DEBUG( HAM_LOG << "ostream::sync() allocated local buffer size: " << size_ << std::endl; )
+			memcpy((char *) buffer_.get(), temp.c_str(), size_); // COPY ... no other option, depending on backend we need the mem to be allocated by new_buffer
 			return stream_proxy(this);
 		}
 	}
@@ -133,15 +133,15 @@ class istream : public stream_base, public std::istringstream {
 public:
 	istream(const stream_proxy proxy) : stream_base(proxy.target_, proxy.buffer_, proxy.size_),
 	                                    std::istringstream() {
-		HAM_DEBUG( HAM_LOG << "istream::ctor() called with stream_proxy" << target_ << std::endl; )
+		HAM_DEBUG( HAM_LOG << "istream::ctor() called with stream_proxy from: " << target_ << std::endl; )
 		if (ham::offload::is_host()) {
 			posix_memalign((void **) &local_ptr_, constants::CACHE_LINE_SIZE, size_);
 			offload::get_sync(buffer_, local_ptr_, size_);
-			HAM_DEBUG( HAM_LOG << "istream::sync() host retrieved data from " << buffer_.node() << std::endl; )
+			HAM_DEBUG( HAM_LOG << "istream::ctor() host retrieved data from " << buffer_.node() << " size: " << size_ << std::endl; )
 			this->rdbuf()->pubsetbuf(local_ptr_, size_);
 		} else {
 			rdbuf()->pubsetbuf(buffer_.get(), size_); // avoid a copy that would be necessary when using str(string) to set the content
-			HAM_DEBUG( HAM_LOG << "istream::sync() target set streambuffer to remote buffer" << target_ << std::endl; )
+			HAM_DEBUG( HAM_LOG << "istream::ctor() target set streambuffer to remote buffer" << target_ << " size: " << size_ << std::endl; )
 		}
 	}
 	// fail on underflow, set flags/state whatever, check std::istream interface
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 4a2dda3..f5dcdd7 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -71,8 +71,8 @@ if (MPI_FOUND)
 	add_executable(test_argument_transfer_mpi test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi ham_offload_mpi)
 
-#	add_executable(test_streams_mpi test_streams.cpp)
-#	target_link_libraries(test_streams_mpi ham_offload_mpi)
+	add_executable(test_streams_mpi test_streams.cpp)
+	target_link_libraries(test_streams_mpi ham_offload_mpi)
 
 # RMA MPI
 
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
index 33a736c..8108980 100644
--- a/src/ham/CMakeLists.txt
+++ b/src/ham/CMakeLists.txt
@@ -18,7 +18,7 @@ set(HAM_LIB_SRC
 	net/communicator_mpi_rma_dynamic.cpp
 	offload/runtime.cpp
 	offload/offload.cpp
-#	offload/stream.cpp
+	offload/stream.cpp
 	util/cpu_affinity.cpp)
 
 # TCP
diff --git a/src/test_streams.cpp b/src/test_streams.cpp
index a5bfb5d..248e52a 100644
--- a/src/test_streams.cpp
+++ b/src/test_streams.cpp
@@ -19,7 +19,7 @@ struct MyData {
 	template<class Archive>
 	void serialize(Archive & archive)
 	{
-		archive( one, two );
+		archive( cereal::binary_data( one, sizeof(char)*1024), cereal::binary_data( two, sizeof(char)*1024));
 	}
 };
 
@@ -28,7 +28,7 @@ struct MyData {
 ham::offload::stream::stream_proxy offloaded_fun(ham::offload::stream::stream_proxy osp)
 {
 	ham::offload::stream::istream his(osp); // NOTE: data is already on the target
-
+	
     MyData m1, m2, m3;
     {
         cereal::BinaryInputArchive iarchive(his); // Create an input archive
@@ -36,17 +36,31 @@ ham::offload::stream::stream_proxy offloaded_fun(ham::offload::stream::stream_pr
 		iarchive(m1, m2, m3); // Read the data from the archive
 	}
 
+	printf("tin: 1.1 %.10s\n", m1.one);
+	printf("tin: 1.2 %.10s\n", m1.two);
+	printf("tin: 2.1 %.10s\n", m2.one);
+	printf("tin: 2.2 %.10s\n", m2.two);
+	printf("tin: 3.1 %.10s\n", m3.one);
+	printf("tin: 3.2 %.10s\n", m3.two);
+
 	char* bla = "0123456789";
 	strcpy(m1.one, bla);
 	strcpy(m1.two, bla);
 	char* blub = "ABCDEFGHI";
 	strcpy(m2.one, blub);
 	strcpy(m2.two, blub);
-	strcpy(m2.one, bla);
-	strcpy(m2.two, blub);
+	strcpy(m3.one, bla);
+	strcpy(m3.two, blub);
 
+	printf("tout: 1.1 %.10s\n", m1.one);
+	printf("tout: 1.2 %.10s\n", m1.two);
+	printf("tout: 2.1 %.10s\n", m2.one);
+	printf("tout: 2.2 %.10s\n", m2.two);
+	printf("tout: 3.1 %.10s\n", m3.one);
+	printf("tout: 3.2 %.10s\n", m3.two);
 
 	ham::offload::stream::ostream hos(0);
+	
 
     {
         cereal::BinaryOutputArchive oarchive(hos);
@@ -63,9 +77,26 @@ int main(int argc, char* argv[])
 	ham::offload::node_t target = 1;
 
 	ham::offload::stream::ostream hos(target);
+	
 
 	MyData m1, m2, m3; // could be out of scope, data to be transferred
 
+	char* bla = "9876543210";
+	strcpy(m1.one, bla);
+	strcpy(m1.two, bla);
+	char* blub = "IHGFEDCBA";
+	strcpy(m2.one, blub);
+	strcpy(m2.two, blub);
+	strcpy(m3.one, bla);
+	strcpy(m3.two, blub);
+	
+	printf("hout: 1.1 %.10s\n", m1.one);
+	printf("hout: 1.2 %.10s\n", m1.two);
+	printf("hout: 2.1 %.10s\n", m2.one);
+	printf("hout: 2.2 %.10s\n", m2.two);
+	printf("hout: 3.1 %.10s\n", m3.one);
+	printf("hout: 3.2 %.10s\n", m3.two);
+
 	{
 		cereal::BinaryOutputArchive oarchive(hos); // Create an output archive
 		oarchive(m1, m2, m3); // Write the data to the archive
@@ -76,19 +107,19 @@ int main(int argc, char* argv[])
 
 	auto in_proxy = ham::offload::sync(target, f2f(&offloaded_fun, out_proxy));
 
-    ham::offload::stream::istream his(in_proxy);
+    	ham::offload::stream::istream his(in_proxy);
+	
 
 	{
 		cereal::BinaryInputArchive iarchive(his);
 		iarchive(m1, m2, m3);
 	}
-
-	printf("%.10s\n", m1.one);
-	printf("%.10s\n", m1.two);
-	printf("%.10s\n", m2.one);
-	printf("%.10s\n", m2.two);
-	printf("%.10s\n", m3.one);
-	printf("%.10s\n", m3.two);
+	printf("hin: 1.1 %.10s\n", m1.one);
+	printf("hin: 1.2 %.10s\n", m1.two);
+	printf("hin: 2.1 %.10s\n", m2.one);
+	printf("hin: 2.2 %.10s\n", m2.two);
+	printf("hin: 3.1 %.10s\n", m3.one);
+	printf("hin: 3.2 %.10s\n", m3.two);
 	return 0;	
 }
 

From 3206fffe3406ea9a38d9d7aab20ec9aedc5de276 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sat, 8 Jun 2019 14:38:23 +0200
Subject: [PATCH 148/150] cleanup

---
 ...communicator_mpi_rma_dynamic_data_only.hpp | 400 ++++++++++++++++++
 1 file changed, 400 insertions(+)
 create mode 100644 include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp

diff --git a/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp b/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp
new file mode 100644
index 0000000..4dff738
--- /dev/null
+++ b/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp
@@ -0,0 +1,400 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef ham_net_communicator_mpi_rma_dynamic_hpp
+#define ham_net_communicator_mpi_rma_dynamic_hpp
+
+#include <mpi.h>
+
+#include <cassert>
+#include <cstring> // memcpy
+#include <stdlib.h> // posix_memalign
+
+#include "ham/misc/constants.hpp"
+#include "ham/misc/resource_pool.hpp"
+#include "ham/misc/types.hpp"
+#include "ham/util/debug.hpp"
+#include "ham/util/log.hpp"
+
+namespace ham {
+namespace net {
+
+// Handle to a buffer of T located on a specific node; additionally carries the MPI address used as RMA displacement.
+template<typename T>
+class buffer_ptr {
+public:
+	buffer_ptr(); // defined below the communicator class, because it uses communicator::this_node()
+	buffer_ptr(T* ptr, node_t node) : ptr_(ptr), node_(node), mpi_address_(0) { } // no MPI address recorded (stored as 0)
+	buffer_ptr(T* ptr, node_t node, MPI_Aint mpi_address) : ptr_(ptr), node_(node), mpi_address_(mpi_address) { }
+
+	T* get() { return ptr_; }
+	node_t node() { return node_; }
+	MPI_Aint get_mpi_address() { return mpi_address_; }
+
+	// element access, asserts that the buffer belongs to the calling node
+	T& operator [] (size_t i);
+
+	// basic pointer arithmetic to address sub-buffers; the MPI address is advanced by the same byte offset,
+	buffer_ptr<T> operator+(size_t off) // so that RMA operations on the result address the sub-buffer as well
+	{
+		return buffer_ptr(ptr_ + off, node_, mpi_address_ + static_cast<MPI_Aint>(off * sizeof(T)));
+	}
+
+private:
+	T* ptr_;              // start of the buffer; only dereferenced through operator[] on the owning node
+	node_t node_;         // node the buffer lives on
+	MPI_Aint mpi_address_; // address passed as target displacement to MPI_Put/MPI_Get/MPI_Rput/MPI_Rget
+};
+
+class node_descriptor
+{
+public:
+	//node_descriptor() : name(MPI_MAX_PROCESSOR_NAME, 0) {}
+
+	//const std::string& name() const { return name_; }
+	const char* name() const { return name_; }
+private:
+	//std::string name_; // TODO(improvement): unify node description for all back-ends, NOTE: std::string is not trivally transferable
+	char name_[MPI_MAX_PROCESSOR_NAME + 1];
+
+	friend class net::communicator;
+};
+
+class communicator {
+public:
+	// externally used interface of request must be shared across all communicator-implementations
+	class request {
+	public:
+		request() : valid_(false) {} // instantiate invalid
+		
+		request(node_t target_node, node_t source_node, size_t send_buffer_index, size_t recv_buffer_index)
+		 : target_node(target_node), source_node(source_node), valid_(true), send_buffer_index(send_buffer_index), recv_buffer_index(recv_buffer_index), req_count(0), uses_rma_(false)
+		{}
+
+		// Non-blocking completion check: returns true once all MPI requests recorded in this request have completed.
+		// will not work as intended for rma ops, no equivalent to test() available for remote completion
+		bool test()
+		{
+			int flag = 0;
+			// MPI_Testall takes an array of statuses, so the plural constant MPI_STATUSES_IGNORE is the one to pass
+			MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUSES_IGNORE);
+
+			if(uses_rma_)
+			{
+				HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
+			}
+			return flag != 0;
+		}
+
+		// Blocks until all MPI requests recorded in this request have completed, then returns a pointer to the
+		// receive message buffer that was reserved for this request (recv_buffer_index of the target's peer entry).
+		void* get()
+		{
+			HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+			MPI_Waitall(req_count, mpi_reqs, MPI_STATUSES_IGNORE); // must wait for all requests to satisfy the standard; array form needs MPI_STATUSES_IGNORE
+			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
+			if(uses_rma_) // additionally flush the target's window so outstanding RMA operations are completed there, too
+				MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_win);
+			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
+		}
+
+		template<class T>
+		void send_result(T* result_msg, size_t size)
+		{
+			assert(communicator::this_node() == target_node); // this assert fails if send_result is called from the wrong side
+			
+			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
+			MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
+		}
+
+		bool valid() const
+		{
+			return valid_;
+		}
+
+        bool uses_rma() const
+        {
+            return uses_rma_;
+        }
+
+		MPI_Request& next_mpi_request()
+		{
+			HAM_DEBUG( HAM_LOG << "next_mpi_request(): this=" << this << ", req_count=" << req_count << ", NUM_REQUESTS=" << NUM_REQUESTS << std::endl; )
+			assert(req_count < NUM_REQUESTS);
+			return mpi_reqs[req_count++]; // NOTE: post-increment
+		}
+
+		node_t target_node;
+		node_t source_node;
+		bool valid_;
+        bool uses_rma_;
+
+		// only needed by the sender
+		enum { NUM_REQUESTS = 3 };
+		
+		size_t send_buffer_index; // buffer to use for sending the message
+		size_t recv_buffer_index; // buffer to use for receiving the result
+		size_t req_count;
+		
+	private:
+		MPI_Request mpi_reqs[NUM_REQUESTS]; // for sending the msg, receiving the result, and an associated data transfer
+	}; // class request
+
+	typedef request& request_reference_type;
+	typedef const request& request_const_reference_type;
+
+	communicator(int argc, char* argv[])
+	{
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI" << std::endl; )
+
+		instance_ = this;
+		int p;
+		MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &p);
+		if (p != MPI_THREAD_MULTIPLE)
+		{
+			std::cerr << "Could not initialise MPI with MPI_THREAD_MULTIPLE, MPI_Init_thread() returned " << p << std::endl;
+		}
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI ..." << std::endl; )
+
+		int t;
+		MPI_Comm_rank(MPI_COMM_WORLD, &t);
+		this_node_ = t;
+		MPI_Comm_size(MPI_COMM_WORLD, &t);
+		nodes_ = t;
+		host_node_ = 0; // TODO(improvement): make configureable, like for SCIF
+
+		HAM_DEBUG( std::cout << "communicator::communicator(): initialising MPI done" << std::endl; )
+
+		peers = new mpi_peer[nodes_];
+		
+		// start of node descriptor code:
+		node_descriptions.resize(nodes_);
+		
+		// build own node descriptor
+		node_descriptor node_description;
+		int count;
+		MPI_Get_processor_name(node_description.name_, &count);
+		node_description.name_[count] = 0x0; // null terminate
+
+		// communicate descriptors between nodes
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions" << std::endl; )
+		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
+
+
+        if (is_host()) {
+
+            for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+                // allocate buffers
+                peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+                // fill resource pools
+                for (size_t j = constants::MSG_BUFFERS; j > 0; --j) {
+                    peers[i].buffer_pool.add(j - 1);
+                }
+            }
+        }
+
+        // initialise 1 global window per target for data
+        for (node_t i = 1; i < nodes_; ++i) {
+            MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_win));
+        }
+
+	// get all locks to targets
+        // targets lock to other targets for copies
+        for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+            if(i != this_node_) {
+                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks because all ranks lock on every target concurrently
+            }
+        }
+
+        HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
+
+	}
+
+	~communicator()
+	{
+		MPI_Finalize(); // TODO(improvement): check on error and create output if there was one; NOTE(review): within this class the windows locked in the ctor are never unlocked/freed and the peers array is never deleted before finalizing
+		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
+	}
+
+
+	request allocate_request(node_t remote_node)
+	{
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+
+		const size_t send_buffer_index = peers[remote_node].buffer_pool.allocate();
+		const size_t recv_buffer_index = peers[remote_node].buffer_pool.allocate();
+
+		return { remote_node, this_node_, send_buffer_index, recv_buffer_index };
+	}
+
+	void free_request(request& req)
+	{
+		assert(req.valid());
+		assert(req.source_node == this_node_);
+	
+		mpi_peer& peer = peers[req.target_node];
+
+		peer.buffer_pool.free(req.send_buffer_index);
+		peer.buffer_pool.free(req.recv_buffer_index);
+		req.valid_ = false;
+	}
+
+public:
+	void send_msg(request_reference_type req, void* msg, size_t size)
+	{
+		// copy message from caller into transfer buffer
+		void* msg_buffer = static_cast<void*>(&peers[req.target_node].msg_buffers[req.send_buffer_index]);
+		memcpy(msg_buffer, msg, size);
+		MPI_Isend(msg_buffer, size, MPI_BYTE, req.target_node, constants::DEFAULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+	}
+	
+	// to be used by the offload target's main loop: synchronously receive one message at a time
+	// NOTE: the local static receive buffer!
+	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE)
+	{
+		static msg_buffer buffer; // NOTE !
+		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
+        return static_cast<void*>(&buffer);
+	}
+
+	// trigger receiving the result of a message on the sending side
+	void recv_result(request_reference_type req)
+	{
+		// post a non-blocking receive for the result message (RESULT_TAG) from the target node into the receive
+		// buffer reserved for this request; the MPI request is stored in req, so completion shows up in its test()/get()
+		MPI_Irecv(static_cast<void*>(&peers[req.target_node].msg_buffers[req.recv_buffer_index]), constants::MSG_SIZE, MPI_BYTE, req.target_node, constants::RESULT_TAG, MPI_COMM_WORLD, &req.next_mpi_request());
+		return;
+	}
+
+	// in MPI RMA backend only used by copy
+	// host uses async version
+	// targets don't send data to host as host uses rma get
+	template<typename T>
+	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size)
+	{
+		// execute transfer
+		MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
+        	MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
+	}
+
+	// to be used by the host only
+	template<typename T>
+	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
+	{
+        req.uses_rma_ = true;
+
+        // MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
+        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
+	}
+
+	// Synchronous RMA read: fetches size elements of T from remote_source into local_dest and flushes the window
+	// of the remote node before returning.
+	// Noted by the original author as not used in the MPI RMA backend (host uses the async version, targets don't
+	// use get) and as probably safe to remove.
+	template<typename T>
+	void recv_data(buffer_ptr<T> remote_source, T* local_dest, size_t size)
+	{
+		MPI_Get(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win); // origin buffer is the local destination
+		MPI_Win_flush(remote_source.node(), peers[remote_source.node()].rma_win);
+	}
+	
+	// to be used by the host
+	template<typename T>
+	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
+	{
+        req.uses_rma_ = true;
+
+		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
+	}
+
+	template<typename T>
+	buffer_ptr<T> allocate_buffer(const size_t n, node_t source_node)
+	{
+		T* ptr;
+		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+        // attach to own window
+        MPI_Win_attach(peers[this_node_].rma_win, (void*)ptr, n * sizeof(T));
+        	MPI_Aint mpi_address;
+		MPI_Get_address((void*)ptr, &mpi_address);
+		// NOTE: no ctor is called
+		return buffer_ptr<T>(ptr, this_node_, mpi_address);
+	}
+
+	// for host to allocate peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
+	template<typename T>
+	buffer_ptr<T> allocate_peer_buffer(const size_t n, node_t source_node)
+	{
+		T* ptr;
+		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
+		// NOTE: no ctor is called
+		return buffer_ptr<T>(ptr, this_node_);
+	}
+
+	template<typename T>
+	void free_buffer(buffer_ptr<T> ptr)
+	{
+		assert(ptr.node() == this_node_);
+		// NOTE: no dtor is called
+        	// remove from own rma window
+        	MPI_Win_detach(peers[this_node_].rma_win, ptr.get());
+		free(static_cast<void*>(ptr.get()));
+	}
+
+    	// for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
+	template<typename T>
+	void free_peer_buffer(buffer_ptr<T> ptr)
+	{
+		assert(ptr.node() == this_node_);
+		// NOTE: no dtor is called
+		free(static_cast<void*>(ptr.get()));
+	}
+
+	static communicator& instance() { return *instance_; }
+	static node_t this_node() { return instance().this_node_; }
+	static size_t num_nodes() { return instance().nodes_; }
+	bool is_host() { return this_node_ == 0; } // TODO(improvement): ham_address == ham_host_address ; }
+	bool is_host(node_t node) { return node == 0; } // TODO(improvement): node == ham_host_address; }
+
+	static const node_descriptor& get_node_description(node_t node)
+	{
+		return instance().node_descriptions[node];
+	}
+
+private:
+	static communicator* instance_;
+	node_t this_node_;
+	size_t nodes_;
+	node_t host_node_;
+	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
+
+	struct mpi_peer {
+		buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender
+
+		// needed by sender to manage which buffers are in use and which are free
+		// just manages indices, that can be used by
+		detail::resource_pool<size_t> buffer_pool;
+
+		// mpi rma dynamic window for data transfers
+		MPI_Win rma_win;
+	};
+	
+	mpi_peer* peers;
+};
+
+template<typename T>
+buffer_ptr<T>::buffer_ptr() : buffer_ptr(nullptr, communicator::this_node()) { }
+
+template<typename T>
+T& buffer_ptr<T>::operator[](size_t i)
+{
+	assert(node_ == communicator::this_node());
+	return ptr_[i];
+}
+
+} // namespace net
+} // namespace ham
+
+#endif // ham_net_communicator_mpi_rma_dynamic_hpp

From fc47b02a9f4c396be707c9c9142bdc081e859055 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sat, 8 Jun 2019 16:41:42 +0200
Subject: [PATCH 149/150] cleanup

---
 src/benchmark_streams.cpp                     | 168 ++++++++++++++++++
 ...communicator_mpi_rma_dynamic_data_only.cpp |   9 +
 2 files changed, 177 insertions(+)
 create mode 100644 src/benchmark_streams.cpp
 create mode 100644 src/ham/net/communicator_mpi_rma_dynamic_data_only.cpp

diff --git a/src/benchmark_streams.cpp b/src/benchmark_streams.cpp
new file mode 100644
index 0000000..049c4b8
--- /dev/null
+++ b/src/benchmark_streams.cpp
@@ -0,0 +1,168 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/offload.hpp"
+#include "ham/offload/stream.hpp"
+
+#include <boost/program_options.hpp>
+
+#include "cereal/archives/binary.hpp"
+
+#include "ham/util/time.hpp"
+
+#include <array>
+#include <iostream>
+
+using namespace std;
+using namespace ham::util::time;
+using namespace ham;
+
+// Holder for the pre-allocated user buffer; filled in by set_cheese(), which main calls locally for the host and through an offload for the target.
+// It is ugly, but is used to remove the allocation of the user-buffer from the benchmarked time,
+// because we want to measure the overhead of the streaming abstraction, not how long it takes to instantiate user data.
+class cheese {
+public:
+	static char* d1; // the user buffer, allocated by set_cheese()
+	static size_t cheese_size; // size of d1 in bytes, as passed to set_cheese()
+};
+char* cheese::d1 = nullptr;
+size_t cheese::cheese_size = 0;
+// Allocates the cache-line aligned user buffer of `size` bytes outside of the timed region; terminates the process if the allocation fails.
+void set_cheese(size_t size) {
+	if (posix_memalign((void**)&cheese::d1, constants::CACHE_LINE_SIZE, size) != 0) { cerr << "set_cheese: posix_memalign failed" << endl; abort(); } // never benchmark on an invalid buffer
+	cheese::cheese_size = size;
+}
+// Offloaded by main to the target: reads the incoming stream into cheese::d1, writes the same buffer into a new output stream and returns that stream's proxy.
+ham::offload::stream::stream_proxy offloaded_fun(ham::offload::stream::stream_proxy osp)
+{
+	ham::offload::stream::istream his(osp);
+
+    {
+        cereal::BinaryInputArchive iarchive(his);
+
+		iarchive(cereal::binary_data(cheese::d1, sizeof(char)*cheese::cheese_size));
+	}
+
+	//if(cheese::d1[1337] == 'a') cheese::d1[1337] = 'b';
+	ham::offload::stream::ostream hos(0, cheese::cheese_size); // NOTE(review): node 0 is presumably the host - confirm
+    {
+        cereal::BinaryOutputArchive oarchive(hos);
+        oarchive(cereal::binary_data(cheese::d1, sizeof(char)*cheese::cheese_size));
+    }
+	auto out_proxy = hos.sync();
+    return out_proxy;
+}
+// Benchmark driver: times every phase of a streamed round trip to the target and back, then prints the statistics per phase.
+int main(int argc, char* argv[])
+{
+	// option defaults
+	unsigned int warmup_runs = 1;
+	unsigned int runs = 1000;
+	size_t data_size = 1024*1024;
+
+	// command line options
+	boost::program_options::options_description desc("Supported options");
+	desc.add_options()
+			("help,h", "Shows this message")
+			("runs,r", boost::program_options::value(&runs)->default_value(runs), "number of identical inner runs for which the average time will be computed")
+			("warmup-runs", boost::program_options::value(&warmup_runs)->default_value(warmup_runs), "number of additional warmup runs before times are measured")
+			("size,s", boost::program_options::value(&data_size)->default_value(data_size), "size of transferred data in byte (multiple of 4)")
+			;
+
+	boost::program_options::variables_map vm;
+
+	boost::program_options::store(boost::program_options::command_line_parser(argc, argv).options(desc).allow_unregistered().run(), vm);
+	boost::program_options::notify(vm);
+	if (vm.count("help")) { cout << desc << endl; return 0; } // act on the registered --help option instead of silently running the benchmark
+	ham::offload::node_t target = 1;
+
+	// used to avoid benchmarking memory allocation for the target object on the target side
+	set_cheese(data_size);
+	ham::offload::ping(target, f2f(&set_cheese, data_size));
+
+	statistics comp_time(runs, warmup_runs);
+	statistics put_time(runs, warmup_runs);
+	statistics call_time(runs, warmup_runs);
+	statistics get_time(runs, warmup_runs);
+	statistics os_time(runs, warmup_runs);
+	statistics is_time(runs, warmup_runs);
+
+	for (unsigned int i = 0; i < (runs + warmup_runs) ; ++i) { // unsigned counter: matches the type of runs and warmup_runs
+		//cheese::d1[1337] = 'a';
+		timer comp;
+		ham::offload::stream::ostream hos(target, cheese::cheese_size);
+		timer ost;
+		{
+			cereal::BinaryOutputArchive oarchive(hos);
+			oarchive(cereal::binary_data(cheese::d1, sizeof(char)*data_size)); //sizeof(char)*data_size)
+		}
+		os_time.add(ost);
+		timer put;
+		auto out_proxy = hos.sync();
+		put_time.add(put);
+		timer call;
+		auto in_proxy = ham::offload::sync(target, f2f(&offloaded_fun, out_proxy));
+		call_time.add(call);
+		timer get;
+		ham::offload::stream::istream his(in_proxy);
+		get_time.add(get);
+		timer ist;
+		{
+			cereal::BinaryInputArchive iarchive(his);
+			iarchive(cereal::binary_data(cheese::d1, sizeof(char)*data_size));
+		}
+		is_time.add(ist);
+		comp_time.add(comp);
+		//assert(cheese::d1[1337] == 'b');
+	}
+
+	std::string header_string = "name\t" + statistics::header_string() + "\tdata_size";
+
+	cout << endl <<"HAM-Offload stream overall: " << endl
+	     << header_string << endl
+	     << "stream:\t" << comp_time.string() << "\t" << data_size << endl << endl;
+	cout << "HAM-Offload streams ostream: " << endl
+	     << header_string << endl
+	     << "stream:\t" << os_time.string() << "\t" << data_size << endl << endl;
+	cout << "HAM-Offload streams copy-in: " << endl
+	     << header_string << endl
+	     << "stream:\t" << put_time.string() << "\t" << data_size << endl << endl;
+	cout << "HAM-Offload streamed call: " << endl
+	     << header_string << endl
+	     << "stream:\t" << call_time.string() << "\t" << data_size << endl << endl;
+	cout << "HAM-Offload streamed copy-out: " << endl
+	     << header_string << endl
+	     << "stream:\t" << get_time.string() << "\t" << data_size << endl << endl;
+	cout << "HAM-Offload streamed istream: " << endl
+	     << header_string << endl
+	     << "stream:\t" << is_time.string() << "\t" << data_size << endl << endl;
+
+	// one-off timings: extracting the stream buffer as a string, copying it with memcpy, and calling c_str()
+	statistics str_time(1, 0);
+
+	ham::offload::stream::ostream hos(target, cheese::cheese_size);
+	{
+		cereal::BinaryOutputArchive oarchive(hos);
+		oarchive(cereal::binary_data(cheese::d1, sizeof(char)*data_size)); //sizeof(char)*data_size)
+	}
+	timer str_tim;
+
+		string tmp = hos.rdbuf()->str();
+
+	str_time.add(str_tim);
+	statistics cpy_time(1, 0);
+	statistics cstr_time(1, 0);
+	timer cpy_tim;
+	memcpy((void *) cheese::d1, tmp.c_str(), cheese::cheese_size);
+	cpy_time.add(cpy_tim);
+	timer cstr_tim;
+	const char* asdf = tmp.c_str(); (void)asdf; // only the call is timed, the pointer itself is intentionally unused
+	cstr_time.add(cstr_tim);
+	cout << str_time.string() << endl;
+	cout << cpy_time.string() << endl;
+	cout << cstr_time.string() << endl;
+	return 0;
+}
+
+
diff --git a/src/ham/net/communicator_mpi_rma_dynamic_data_only.cpp b/src/ham/net/communicator_mpi_rma_dynamic_data_only.cpp
new file mode 100644
index 0000000..e4e5dbd
--- /dev/null
+++ b/src/ham/net/communicator_mpi_rma_dynamic_data_only.cpp
@@ -0,0 +1,9 @@
+// Copyright (c) 2013-2014 Matthias Noack (ma.noack.pr@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include "ham/net/communicator.hpp"
+
+// Out-of-class definition of the communicator's static instance pointer; it starts out as nullptr.
+ham::net::communicator* ham::net::communicator::instance_ = nullptr;

From 341523605f3134e7d74c0f80d96c5e55f123e3e6 Mon Sep 17 00:00:00 2001
From: Daniel Deppisch <deppisch@zib.de>
Date: Sat, 8 Jun 2019 16:46:48 +0200
Subject: [PATCH 150/150] cleanup

---
 include/ham/net/communicator.hpp              |   4 +-
 .../ham/net/communicator_mpi_rma_dynamic.hpp  | 552 +++++++-----------
 ...communicator_mpi_rma_dynamic_data_only.hpp | 106 ++--
 include/ham/offload/offload.hpp               |  16 +-
 include/ham/offload/offload_msg.hpp           | 100 ++--
 src/CMakeLists.txt                            |  40 +-
 src/ham/CMakeLists.txt                        |  37 +-
 7 files changed, 375 insertions(+), 480 deletions(-)

diff --git a/include/ham/net/communicator.hpp b/include/ham/net/communicator.hpp
index 390279a..ea410f8 100644
--- a/include/ham/net/communicator.hpp
+++ b/include/ham/net/communicator.hpp
@@ -58,10 +58,12 @@ namespace net {
 #include "ham/net/communicator_scif.hpp"
 #elif defined HAM_COMM_MPI_RMA_DYNAMIC
 #include "ham/net/communicator_mpi_rma_dynamic.hpp"
+#elif defined HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY
+#include "ham/net/communicator_mpi_rma_dynamic_data_only.hpp"
 #elif defined HAM_COMM_TCP
 #include "ham/net/communicator_tcp.hpp"
 #else
-static_assert(false, "Please define either HAM_COMM_MPI, HAM_COMM_MPI_RMA_DYNAMIC or HAM_COMM_SCIF.");
+static_assert(false, "Please define either HAM_COMM_MPI, HAM_COMM_SCIF, HAM_COMM_MPI_RMA_DYNAMIC, HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY or HAM_COMM_TCP");
 #endif
 
 #endif // ham_net_communicator_hpp
diff --git a/include/ham/net/communicator_mpi_rma_dynamic.hpp b/include/ham/net/communicator_mpi_rma_dynamic.hpp
index 4c4bb65..e74c1ba 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic.hpp
@@ -17,7 +17,6 @@
 #include "ham/misc/types.hpp"
 #include "ham/util/debug.hpp"
 #include "ham/util/log.hpp"
-// #include "ham/util/time.hpp"
 #include "communicator.hpp"
 
 namespace ham {
@@ -27,15 +26,15 @@ template<typename T>
 class buffer_ptr {
 public:
 	buffer_ptr();
-    buffer_ptr(T* ptr, node_t node) : ptr_(ptr), node_(node), mpi_address_(0) { }
+	buffer_ptr(T* ptr, node_t node) : ptr_(ptr), node_(node), mpi_address_(0) { }
 	buffer_ptr(T* ptr, node_t node, MPI_Aint mpi_address) : ptr_(ptr), node_(node), mpi_address_(mpi_address) { }
 
 
 	T* get() { return ptr_; }
 	node_t node() { return node_; }
-    MPI_Aint get_mpi_address() { return mpi_address_; }
+		MPI_Aint get_mpi_address() { return mpi_address_; }
 
-    // element access
+		// element access
 	T& operator [] (size_t i);
 
 	// basic pointer arithmetic to address sub-buffers
@@ -47,15 +46,12 @@ class buffer_ptr {
 private:
 	T* ptr_;
 	node_t node_;
-    MPI_Aint mpi_address_;
+		MPI_Aint mpi_address_;
 };
 
 class node_descriptor
 {
 public:
-	//node_descriptor() : name(MPI_MAX_PROCESSOR_NAME, 0) {}
-
-	//const std::string& name() const { return name_; }
 	const char* name() const { return name_; }
 private:
 	//std::string name_; // TODO(improvement): unify node description for all back-ends, NOTE: std::string is not trivally transferable
@@ -67,11 +63,11 @@ class node_descriptor
 class communicator {
 public:
 	enum {
-        NO_BUFFER_INDEX = constants::MSG_BUFFERS, // invalid buffer index (max valid + 1)
-        FLAG_FALSE = constants::MSG_BUFFERS + 1 // special value, outside normal index range
-    };
+	NO_BUFFER_INDEX = constants::MSG_BUFFERS, // invalid buffer index (max valid + 1)
+	FLAG_FALSE = constants::MSG_BUFFERS + 1 // special value, outside normal index range
+	};
 
-    // externally used interface of request must be shared across all communicator-implementations
+	// externally used interface of request must be shared across all communicator-implementations
 	class request {
 	public:
 		request() : valid_(false) {} // instantiate invalid
@@ -81,38 +77,37 @@ class communicator {
 		{}
 
 		// return true if request was finished
-        // will not work as intended for rma ops, no equivalent to test() available for remote completion
+		// will not work as intended for rma ops, no equivalent to test() available for remote completion
 		bool test()
 		{
-            if(data_transfer_type) { // this will be true for rma data transfers
-                int flag = 0;
-                MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // test on RGET is what we want, because local completion = full completion for get, but for RPut local is not enough and there is no non-blocking remote-completion test
-                HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma put remote completion" << std::endl; )
-                // TODO - Daniel: this is bad but MPI RMA doesn't have anything better
-                // TODO - Daniel: discuss preliminary design decision with Matthias: false positive + longer block = better than false negative as users may poll on this and get stuck
-                return flag != 0;
-            }
-            return communicator::instance().test_local_flag(target_node, local_buffer_index);
+				if(data_transfer_type) { // this will be true for rma data transfers
+				int flag = 0;
+				MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // test on RGET is what we want, because local completion = full completion for get, but for RPut local is not enough and there is no non-blocking remote-completion test
+				HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma put remote completion" << std::endl; )
+				// TODO - Daniel: this is bad but MPI RMA doesn't have anything better
+				return flag != 0;
+				}	
+			return communicator::instance().test_local_flag(target_node, local_buffer_index);
 		}
 
 		void* get() // blocks
 		{
-            if(data_transfer_type)  {
-                HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
-                MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // Get will have fully completed
-                HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
-                if(data_transfer_type == constants::DATA_PUT_CODE) {
-                    HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Win_flush()" << std::endl; )
-                    communicator::instance().flush_data(target_node);
-                    HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Win_flush()" << std::endl; )
-                }
-                // this will only be true for async rma GETs
-                // there will be no result returned, so this won't poll on anything and return a dummy instead.
-                return nullptr;
-                // TODO - Daniel: this is bad but MPI RMA doesn't have anything better
-            } else {
-                return communicator::instance().recv_msg(target_node, local_buffer_index);
-            }
+			if(data_transfer_type)  {
+				HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
+				MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // Get will have fully completed
+				HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
+				if(data_transfer_type == constants::DATA_PUT_CODE) {
+					HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Win_flush()" << std::endl; )
+					communicator::instance().flush_data(target_node);
+					HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Win_flush()" << std::endl; )
+				}
+				// this will only be true for async rma GETs
+				// there will be no result returned, so this won't poll on anything and return a dummy instead.
+				return nullptr;
+				// TODO - Daniel: this is bad but MPI RMA doesn't have anything better
+			} else {
+				return communicator::instance().recv_msg(target_node, local_buffer_index);
+			}
 		}
 
 		template<class T>
@@ -120,8 +115,6 @@ class communicator {
 		{
 			assert(communicator::this_node() == target_node); // this assert fails if send_result is called from the wrong side
 			
-			// TODO(improvement, low priority): better go through communicator, such that no MPI calls are anywhere else
-			// MPI_Send(result_msg, size, MPI_BYTE, source_node, constants::RESULT_TAG, MPI_COMM_WORLD);
 			communicator::instance().send_msg(source_node, local_buffer_index, NO_BUFFER_INDEX, result_msg, size);
 		}
 
@@ -130,10 +123,10 @@ class communicator {
 			return valid_;
 		}
 
-        bool is_rma_data_transfer() const
-        {
-            return data_transfer_type;
-        }
+		bool is_rma_data_transfer() const
+		{
+		return data_transfer_type;
+		}
 
 		MPI_Request& next_mpi_request()
 		{
@@ -145,7 +138,7 @@ class communicator {
 		node_t target_node;
 		node_t source_node;
 		bool valid_;
-        short data_transfer_type;
+		short data_transfer_type;
 
 		// only needed by the sender
 		enum { NUM_REQUESTS = 3 };
@@ -194,121 +187,80 @@ class communicator {
 		MPI_Get_processor_name(node_description.name_, &count);
 		node_description.name_[count] = 0x0; // null terminate
 
-//		char hostname[MPI_MAX_PROCESSOR_NAME + 1];
-//		MPI_Get_processor_name(hostname, &count);
-//		hostname[count] = 0x0; // null terminate
-//		node_description.name_.assign(hostname, count);
-
-		// append rank for testing:
-		//node_description.name_[count] = 48 + this_node_;
-		//node_description.name_[count+1] = 0x0;
-
 		// communicate descriptors between nodes
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions" << std::endl; )
-		//MPI_Alltoall(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
 		MPI_Allgather(&node_description, sizeof(node_descriptor), MPI_BYTE, node_descriptions.data(), sizeof(node_descriptor), MPI_BYTE, MPI_COMM_WORLD);
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
 
-        /*
-        if (is_host()) {
-
-            for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-                // allocate buffers
-                peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
-                // fill resource pools
-                for (size_t j = constants::MSG_BUFFERS; j > 0; --j) {
-                    peers[i].buffer_pool.add(j - 1);
-                }
-            }
-        }*/
-
-        // initialise data windows
-        for (node_t i = 0; i < nodes_; ++i) {
-            // dynamic data window
-            MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_data_win));
-        }
-
-        // initialise message windows
-        for (node_t i = 0; i < nodes_; ++i) { // loop through ranks
-
-            if (i == this_node_) { // create local windows with allocated memory for targets, host creates one inbound set of windows for all targets
-
-                // allocate memory and create windows
-                if (this_node_ == host_node_) { // host creates one large window with subsets associated with different targets
-
-                    // (MSG_SIZE+FLAG_SIZE) * MSG_BUFFERS * num_nodes = bytes of memory allocated (sizes are implicit in msg_flag_buffer struct)
-                    peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
-                    // peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
-                    // set flags to FLAG_FALSE
-                    reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS * nodes_); // structs are contiguos, this is ok
-
-                    // fill resource pools for managing indices on the host
-                    for (size_t j = 0; j < nodes_; ++j) {
-                        for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
-                            // target buffers
-                            peers[j].local_buffer_pool.add(k - 1);
-                            peers[j].remote_buffer_pool.add(k - 1);
-                        }
-                        // allocate first next_request,
-                        allocate_next_request(j);
-                    }
-                    // create window with memory
-                    MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_flag_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
-                    // MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
-
-                } else { // targets create one window with the size of their msg "queue"
-                    // (MSG_SIZE+FLAG_SIZE) * MSG_BUFFERS = bytes of memory allocated (sizes are implicit in msg_flag_buffer struct)
-                    peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS, this_node_);
-                    // peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
-                    // set flags to FLAG_FALSE
-                    reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS);
-
-                    // create window with memory
-                    MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
-                    // MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
-                }
-
-                // debug msg
-                HAM_DEBUG( HAM_LOG << "Rank: " << this_node_ << " in loop run " << i << " created REAL windows..." << std::endl; )
-
-
-            } else { // create remote windows without memory (join the collective call and retreive the window handle)
-
-                MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_flag_win));
-                // MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].flag_win));
-                // debug msg
-                HAM_DEBUG( HAM_LOG << "Rank: " << this_node_ << " in loop run " << i << " creating EMPTY windows..." << std::endl; )
-                //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].msg_win_data, &(peers[i].rma_msg_win));
-                //MPI_Win_allocate(0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, peers[i].flag_win_data, &(peers[i].rma_flag_win));
-            }
-        }
+
+		// initialise data windows
+		for (node_t i = 0; i < nodes_; ++i) {
+			// dynamic data window
+			MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_data_win));
+		}
+
+		// initialise message windows
+		for (node_t i = 0; i < nodes_; ++i) { // loop through ranks
+
+			if (i == this_node_) { // create local windows with allocated memory for targets, host creates one inbound set of windows for all targets
+
+				// allocate memory and create windows
+				if (this_node_ == host_node_) { // host creates one large window with subsets associated with different targets
+
+					// (MSG_SIZE+FLAG_SIZE) * MSG_BUFFERS * num_nodes = bytes of memory allocated (sizes are implicit in msg_flag_buffer struct)
+					peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+					// peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS * nodes_, this_node_);
+					// set flags to FLAG_FALSE
+					reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS * nodes_); // structs are contiguous, this is ok
+
+					// fill resource pools for managing indices on the host
+					for (size_t j = 0; j < nodes_; ++j) {
+					for (size_t k = constants::MSG_BUFFERS; k > 0; --k) {
+						// target buffers
+						peers[j].local_buffer_pool.add(k - 1);
+						peers[j].remote_buffer_pool.add(k - 1);
+					}
+					// allocate first next_request,
+					allocate_next_request(j);
+					}
+					// create window with memory
+					MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_flag_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
+					// MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS * nodes_, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+
+				} else { // targets create one window with the size of their msg "queue"
+					// (MSG_SIZE+FLAG_SIZE) * MSG_BUFFERS = bytes of memory allocated (sizes are implicit in msg_flag_buffer struct)
+					peers[this_node_].msg_flag_data = allocate_peer_buffer<msg_flag_buffer>(constants::MSG_BUFFERS, this_node_);
+					// peers[this_node_].flag_data = allocate_peer_buffer<cache_line_buffer>(constants::MSG_BUFFERS, this_node_);
+					// set flags to FLAG_FALSE
+					reset_flags(peers[this_node_].msg_flag_data, constants::MSG_BUFFERS);
+
+					// create window with memory
+					MPI_Win_create((peers[this_node_].msg_flag_data.get()), sizeof(msg_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].msg_flag_win));
+					// MPI_Win_create((peers[this_node_].flag_data.get()), sizeof(cache_line_buffer) * constants::MSG_BUFFERS, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[this_node_].flag_win));
+				}
+
+				// debug msg
+				HAM_DEBUG( HAM_LOG << "Rank: " << this_node_ << " in loop run " << i << " created REAL windows..." << std::endl; )
+
+			} else { // create remote windows without memory (join the collective call and retrieve the window handle)
+
+				MPI_Win_create(nullptr, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].msg_flag_win));
+				
+				// debug msg
+				HAM_DEBUG( HAM_LOG << "Rank: " << this_node_ << " in loop run " << i << " creating EMPTY windows..." << std::endl; )
+			}
+	}
 
 		// get all locks to targets for data
-        // targets lock to other targets for copies
-        for (node_t i = 0; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-            if (i != this_node_) {
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_data_win);  // shared locks because all ranks lock on every target concurrently
-            }
-        }
-
-        // MPI_Barrier(MPI_COMM_WORLD);
-
-
-        /* // locking will be done when accessing remote memory
-        // locks for active message rma transfers
-        if (this_node_ != host_node_) { // targets
-            MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[0].msg_win);
-            MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, peers[0].flag_win);
-        } else { // host
-            for (node_t i = 0; i < nodes_; ++i) {
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].msg_win);
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].flag_win);
-            }
-        }
-        */
-
-        HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation completed" << std::endl; )
-        HAM_DEBUG( HAM_LOG << "communicator::communicator(): communicator initialization completed" << std::endl; )
+		// targets lock to other targets for copies
+		for (node_t i = 0; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+			if (i != this_node_) {
+			MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_data_win);  // shared locks because all ranks lock on every target concurrently
+		}
+	}
+
+	HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation completed" << std::endl; )
+	HAM_DEBUG( HAM_LOG << "communicator::communicator(): communicator initialization completed" << std::endl; )
 	}
 
 	~communicator()
@@ -317,162 +269,118 @@ class communicator {
 		HAM_DEBUG( HAM_LOG << "~communicator" << std::endl; )
 	}
 
-    // this is only used by the host to manage remote msg buffers and local reply buffers and assign them to requests
-    const request& allocate_next_request(node_t remote_node)
-    {
-        // this allocates a host-managed index for the remote nodes msg "queue"
-        // so the host knows which buffers are available on the target
-        const size_t remote_buffer_index = peers[remote_node].remote_buffer_pool.allocate();
-        // this allocates an index in the hosts "reply queue"
-        // request is included in offload message, so the target knows into which buffers replys must be written
-        // when used, the index will need to be added to an offset determined by a targets rank to address the part of the buffer belonging to this target
-        // NOTE: the actual host buffer is stored at the hosts peers[0], but the buffer_pools are stored at the corresponding peers[target]
-        // buffer_pools manage idices within the targets section of the hosts buffer
-        const size_t local_buffer_index = peers[remote_node].local_buffer_pool.allocate();
-
-        peers[remote_node].next_request = {remote_node, this_node_, remote_buffer_index, local_buffer_index};
-
-        return peers[remote_node].next_request;
-    }
-
-    // only used by host
+	// this is only used by the host to manage remote msg buffers and local reply buffers and assign them to requests
+	const request& allocate_next_request(node_t remote_node)
+	{
+		// this allocates a host-managed index for the remote nodes msg "queue"
+		// so the host knows which buffers are available on the target
+		const size_t remote_buffer_index = peers[remote_node].remote_buffer_pool.allocate();
+		// this allocates an index in the hosts "reply queue"
+		// request is included in offload message, so the target knows into which buffers replys must be written
+		// when used, the index will need to be added to an offset determined by a targets rank to address the part of the buffer belonging to this target
+		// NOTE: the actual host buffer is stored at the hosts peers[0], but the buffer_pools are stored at the corresponding peers[target]
+		// buffer_pools manage idices within the targets section of the hosts buffer
+		const size_t local_buffer_index = peers[remote_node].local_buffer_pool.allocate();
+
+		peers[remote_node].next_request = {remote_node, this_node_, remote_buffer_index, local_buffer_index};
+
+		return peers[remote_node].next_request;
+	}
+
+	// only used by host
 	request allocate_request(node_t remote_node)
 	{
-        HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
 
 		return peers[remote_node].next_request;
 	}
 
-    // used for rma data transfers, so they wont take up unneeded buffer indices
-    // only put() and get() use this, copy() offloads an active msg to the data source and therefore uses allocate_request()
-    request allocate_data_request(node_t remote_node) {
-        HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
-        return { remote_node, this_node_, NO_BUFFER_INDEX, NO_BUFFER_INDEX };
-    }
+	// used for rma data transfers, so they wont take up unneeded buffer indices
+	// only put() and get() use this, copy() offloads an active msg to the data source and therefore uses allocate_request()
+	request allocate_data_request(node_t remote_node) {
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_next_request(): remote_node = " << remote_node << std::endl; )
+		return { remote_node, this_node_, NO_BUFFER_INDEX, NO_BUFFER_INDEX };
+	}
 
-    // only used by host
+	// only used by host
 	void free_request(request& req)
 	{
 		assert(req.valid());
 		assert(req.source_node == this_node_);
 
-        // dont do any of the following for data transfer requests
-        if(req.remote_buffer_index == NO_BUFFER_INDEX ) {
-            return;
-        }
+		// dont do any of the following for data transfer requests
+		if(req.remote_buffer_index == NO_BUFFER_INDEX ) {
+			return;
+		}
 
-        mpi_peer& peer = peers[req.target_node];
+		mpi_peer& peer = peers[req.target_node];
 
 
-        // reset local flag
-        // local flag is inside the hosts large array of msg_flag_buffers @ peers[host]
-        // index offset computed using target node
-        // as this is an access to rma window memory, we need to lock again...
-        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
-        size_t offset = constants::MSG_BUFFERS * req.target_node; // offset msg_flag_buffers to the corresponding nodes region
-        peers[this_node_].msg_flag_data.get()[offset + req.local_buffer_index].flag = FLAG_FALSE;
-        MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
-        // remote flag on target
-        /* This is done by the target after having received the new index to poll on
-        size_t remote_flag = FLAG_FALSE;
-        MPI_Put(&remote_flag, sizeof(remote_flag), MPI_BYTE, req.target_node, 0, sizeof(remote_flag), MPI_BYTE, peer.flag_win);
-        // flush? don't think so
-        */
+		// reset local flag
+		// local flag is inside the hosts large array of msg_flag_buffers @ peers[host]
+		// index offset computed using target node
+		// as this is an access to rma window memory, we need to lock again...
+		MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
+		size_t offset = constants::MSG_BUFFERS * req.target_node; // offset msg_flag_buffers to the corresponding nodes region
+		peers[this_node_].msg_flag_data.get()[offset + req.local_buffer_index].flag = FLAG_FALSE;
+		MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
 
-        peer.remote_buffer_pool.free(req.remote_buffer_index);
+		peer.remote_buffer_pool.free(req.remote_buffer_index);
 
-        peer.local_buffer_pool.free(req.local_buffer_index);
+		peer.local_buffer_pool.free(req.local_buffer_index);
 
-        req.valid_ = false;
-    }
+		req.valid_ = false;
+	}
 
 public:
-    // make private?!
-    // called by func below
-    void send_msg(node_t node, size_t buffer_index, size_t next_buffer_index, void* msg, size_t size) {
-        // write msg to target msg buffer
-        HAM_DEBUG( HAM_LOG << "communicator::send_msg(): node =  " << node << std::endl; )
-        HAM_DEBUG( HAM_LOG << "communicator::send_msg(): remote buffer index = " << buffer_index << std::endl; )
-
-        if (node != host_node_) { // to targets
-            // ham::util::time::statistics msg_put(1,0);
-            // ham::util::time::statistics flush(1,0);
-            // ham::util::time::statistics flag_put(1,0);
-
-            // ham::util::time::timer t1;
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
-            // put msg
+	// make private?!
+	// called by func below
+	void send_msg(node_t node, size_t buffer_index, size_t next_buffer_index, void* msg, size_t size) {
+		// write msg to target msg buffer
+		HAM_DEBUG( HAM_LOG << "communicator::send_msg(): node =  " << node << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::send_msg(): remote buffer index = " << buffer_index << std::endl; )
+
+		if (node != host_node_) { // to targets
+
+			MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
+			// put msg
 			MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_flag_buffer) * buffer_index, size, MPI_BYTE, peers[node].msg_flag_win);
 			// put flag
 			MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(msg_flag_buffer) * buffer_index + constants::MSG_SIZE, sizeof(next_buffer_index), MPI_BYTE, peers[node].msg_flag_win);
-			// msg_put.add(t1);
+
 			MPI_Win_unlock(node, peers[node].msg_flag_win);
+
 			HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg + flag" << std::endl; )
 
-			// unlock includes flush, no need for it here
-			// ham::util::time::timer t2;
-			// MPI_Win_flush(node, peers[node].msg_win);
-			// flush.add(t2);
-			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
-
-			// write flag to target flags buffer
-			// not sure on the size here?
-			// ham::util::time::timer t3;
-			// MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
-            // flag_put.add(t3);
-            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
-            // MPI_Win_unlock(node, peers[node].msg_flag_win);
-
-        } else { // to host, used by send_result
-            // ham::util::time::statistics msg_put(1,0);
-            // ham::util::time::statistics flush(1,0);
-            // ham::util::time::statistics flag_put(1,0);
-
-            // compute offset in the hosts window
-            size_t offset = constants::MSG_BUFFERS * this_node_;
-            HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using msg host-offset (bytes): " << offset*sizeof(msg_buffer) << std::endl; )
-            // ham::util::time::timer t1;
-            MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
-            // put msg/result
+		} else { // to host, used by send_result
+			
+			// compute offset in the hosts window
+			size_t offset = constants::MSG_BUFFERS * this_node_;
+			HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using msg host-offset (bytes): " << offset*sizeof(msg_buffer) << std::endl; )
+
+			MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
+			// put msg/result
 			MPI_Put(msg, size, MPI_BYTE, node, sizeof(msg_flag_buffer) * (offset + buffer_index), size, MPI_BYTE, peers[node].msg_flag_win);
-            // put flag/result notification
+			// put flag/result notification
 			MPI_Put(&next_buffer_index, sizeof(next_buffer_index), MPI_BYTE, node, sizeof(msg_flag_buffer) * (offset + buffer_index)  + constants::MSG_SIZE, sizeof(next_buffer_index), MPI_BYTE, peers[node].msg_flag_win);
 			MPI_Win_unlock(node, peers[node].msg_flag_win);
 			HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote msg + flag" << std::endl; )
-			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing msg took: " << ""/*msg_put.min().count()*/ << std::endl; )
-
-			// ham::util::time::timer t2;
-			// MPI_Win_flush(node, peers[node].msg_win);
-			// flush.add(t2);
-			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushed msg" << std::endl; )
-			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): flushing msg took: " << ""/*flush.min().count()*/ << std::endl; )
-
-			// ham::util::time::timer t3;
-			// HAM_DEBUG( HAM_LOG << "communicator::send_msg(): using flag host-offset (bytes): " << offset*sizeof(cache_line_buffer) << std::endl; )
-			// MPI_Win_lock(MPI_LOCK_EXCLUSIVE, node, 0, peers[node].msg_flag_win);
-            // flag_put.add(t3);
-            // MPI_Win_unlock(node, peers[node].msg_flag_win);
-            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): wrote flag" << std::endl; )
-            // HAM_DEBUG( HAM_LOG << "communicator::send_msg(): writing flag took: " << ""/*flag_put.min().count()*/ <<std::endl; )
-
-        }
-    }
-    // this is used by the host
+		}
+	}
+
+	// this is used by the host
 	void send_msg(request_reference_type req, void* msg, size_t size) {
-        const request& next_req = allocate_next_request(req.target_node); // this is only required for the host
-        send_msg(req.target_node, req.remote_buffer_index, next_req.remote_buffer_index, msg, size);
-    }
+		const request& next_req = allocate_next_request(req.target_node); // this is only required for the host
+		send_msg(req.target_node, req.remote_buffer_index, next_req.remote_buffer_index, msg, size);
+	}
 
-    // make private?!
-    // called by function below
-    void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE) {
+	// make private?!
+	// called by function below
+	void* recv_msg(node_t node, size_t buffer_index = NO_BUFFER_INDEX, void* msg = nullptr, size_t size = constants::MSG_SIZE) {
 		buffer_index = buffer_index == NO_BUFFER_INDEX ? peers[node].next_flag : buffer_index;
 		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): remote node is: " << node << std::endl; )
 		HAM_DEBUG(HAM_LOG << "communicator::recv_msg(): using buffer index: " << buffer_index << std::endl; )
 
-
 		// size_t *local_flag;
 		flag_t received_flag = FLAG_FALSE;
 
@@ -488,7 +396,7 @@ class communicator {
 			MPI_Get(&received_flag, sizeof(flag_t), MPI_BYTE, this_node_,
 					sizeof(msg_flag_buffer) * (offset + buffer_index) + constants::MSG_SIZE, sizeof(flag_t),
 					MPI_BYTE, peers[this_node_].msg_flag_win);
-			// using a get here, by standard just accessing the memory should be okay too, like below
+			// using a get here, with proper unified memory model just accessing the memory should be okay too, like below
 			// received_flag = peers[this_node_].msg_flag_data.get()[offset + buffer_index].flag);
 			MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
 		} // poll on flag for completion
@@ -507,19 +415,15 @@ class communicator {
 			peers[node].next_flag = received_flag;
 		}
 
-        HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): done " << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::recv_msg(): done " << std::endl; )
 
-        return &peers[this_node_].msg_flag_data.get()[offset + buffer_index]; // offset==0 for non-hosts
-    }
+		return &peers[this_node_].msg_flag_data.get()[offset + buffer_index]; // offset==0 for non-hosts
+	}
 
 	// to be used by the offload target's main loop: synchronously receive one message at a time
-	// NOTE: the local static receive buffer!
 	void* recv_msg_host(void* msg = nullptr, size_t size = constants::MSG_SIZE)
 	{
-		/* static msg_buffer buffer; // NOTE !
-		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-        return static_cast<void*>(&buffer); */
-        return static_cast<void*>(recv_msg(host_node_, NO_BUFFER_INDEX, msg, size));
+		return static_cast<void*>(recv_msg(host_node_, NO_BUFFER_INDEX, msg, size));
 	}
 
 	// trigger asyncly receiving the result of a message on the sending side
@@ -528,27 +432,27 @@ class communicator {
 		// nothing to do here, since this communicator implementation uses one-sided communication
 		// the data will be written to where it is expected
 		HAM_DEBUG( HAM_LOG << "communicator::recv_result(): This does nothing with the MPI RMA communicator" << std::endl; )
-        return;
+		return;
 	}
 
-    // only used by the host through request.test() (top of this file) called by future.test() (offload.hpp)
-    bool test_local_flag(node_t node, size_t buffer_index) {
+	// only used by the host through request.test() (top of this file) called by future.test() (offload.hpp)
+	bool test_local_flag(node_t node, size_t buffer_index) {
 		size_t offset = constants::MSG_BUFFERS * node;
 		flag_t temp_flag = FLAG_FALSE;
 		// public window flag changes may have not have been updated in local window... so we need to lock again here
 		MPI_Win_lock(MPI_LOCK_EXCLUSIVE, this_node_, 0, peers[this_node_].msg_flag_win);
 		temp_flag = peers[node].msg_flag_data.get()[offset + buffer_index].flag;
 		MPI_Win_unlock(this_node_, peers[this_node_].msg_flag_win);
-        return temp_flag != FLAG_FALSE;
-    }
+		return temp_flag != FLAG_FALSE;
+	}
 
-    void flush_data(node_t node) {
-        MPI_Win_flush(node, peers[node].rma_data_win);
-    }
+	void flush_data(node_t node) {
+		MPI_Win_flush(node, peers[node].rma_data_win);
+	}
 
 	// this is only called @ communicator construction to initialize flags with FLAG_FALSE
 	// calling this at any other point may reset flags belonging to messages that have not yet been executed (and never will be then)
-    void reset_flags(buffer_ptr<msg_flag_buffer> msg_flags, size_t size) {
+	void reset_flags(buffer_ptr<msg_flag_buffer> msg_flags, size_t size) {
 		// now this is where a struct of arrays would have been cooler...
 		// TODO - Daniel: Ask Matthias if he knows a cooler solution
 		for (int i = 0; i <= size ; ++i) {
@@ -562,15 +466,15 @@ class communicator {
 	template<typename T>
 	void send_data(T* local_source, buffer_ptr<T> remote_dest, size_t size) {
 		// execute transfer
-        MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win);
-        MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
+		MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win);
+		MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_data_win);
 	}
 
 	// to be used by the host only
 	template<typename T>
 	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size) {
-        req.data_transfer_type = constants::DATA_PUT_CODE;
-        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win, &req.next_mpi_request());
+		req.data_transfer_type = constants::DATA_PUT_CODE;
+		MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_data_win, &req.next_mpi_request());
 	}
 
 	// not used in MPI RMA backend
@@ -587,8 +491,8 @@ class communicator {
 	// to be used by the host
 	template<typename T>
 	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size) {
-        req.data_transfer_type = constants::DATA_GET_CODE;
-        MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win, &req.next_mpi_request());
+		req.data_transfer_type = constants::DATA_GET_CODE;
+		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_data_win, &req.next_mpi_request());
 	}
 
 	template<typename T>
@@ -596,11 +500,10 @@ class communicator {
 	{
 		T* ptr;
 
-		// posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
 		MPI_Alloc_mem(n * sizeof(T), MPI_INFO_NULL, &ptr);
-        // attach to own window
-        HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << std::endl; )
-        MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
+		// attach to own window
+		HAM_DEBUG( HAM_LOG << "communicator::allocate_buffer(), allocating buffer @: " << (long)ptr << std::endl; )
+		MPI_Win_attach(peers[this_node_].rma_data_win, (void*)ptr, n * sizeof(T));
 
 		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
@@ -612,38 +515,32 @@ class communicator {
 	template<typename T>
 	buffer_ptr<T> allocate_peer_buffer(const size_t n, node_t source_node)
 	{
-        T* ptr;
-		// posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T)); // if you revert to memalign, also change back free in free_peer_buffer()
-		// using MPI_Alloc instead as these buffers are used for RMA accesses
+		T* ptr;
+		// using MPI_Alloc instead of posix_memalign as these buffers are used for RMA accesses
 		MPI_Alloc_mem(n * sizeof(T), MPI_INFO_NULL, &ptr);
 		// NOTE: no ctor is called
 		return buffer_ptr<T>(ptr, this_node_);
 	}
 
-    // used for data buffers only
+	// used for data buffers only
 	template<typename T>
 	void free_buffer(buffer_ptr<T> ptr)
 	{
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
-        // remove from own rma window
-        HAM_DEBUG( HAM_LOG << "communicator::free_buffer(), freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
-        MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
-        /* for (node_t i = 1; i < nodes_; ++i) { // nonsense, all accesses to a rank will only take place on that targets window, no need to attach to other
-            MPI_Win_detach(peers[i].rma_data_win, ptr.get());
-        } */
-		// free(static_cast<void*>(ptr.get())); // switch back to this if you revert back from using MPI_alloc_mem()
+		// remove from own rma window
+		HAM_DEBUG( HAM_LOG << "communicator::free_buffer(), freeing buffer @: " << (long)ptr.get() << " belonging to node: " << ptr.node() << std::endl; )
+		MPI_Win_detach(peers[this_node_].rma_data_win, ptr.get());
 		MPI_Free_mem(ptr.get());
 	}
 
-    // for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
+	// for host to free peer message buffers, needed because original function now manages rma window which must not happen for host-only local buffers
 	template<typename T>
 	void free_peer_buffer(buffer_ptr<T> ptr)
 	{
-        // this will never be called on the actual memory mapped to static mpi windows, freeing it would equal "disconnecting" the corresponding target
+		// this must never be called on the actual memory mapped to static mpi windows, freeing it would equal "disconnecting" the corresponding target
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
-		// free(static_cast<void*>(ptr.get())); // switch back to this if you revert back from using MPI_alloc_mem()
 		MPI_Free_mem(ptr.get());
 	}
 
@@ -665,30 +562,27 @@ class communicator {
 	node_t host_node_;
 	std::vector<node_descriptor> node_descriptions; // not as member in peer below, because Allgather is used to exchange node descriptions
 
-    struct mpi_peer {
-
-		// buffer_ptr<msg_buffer> msg_buffers; // buffers used for MPI_ISend and IRecv by the sender // not needed for RMA version, host-side RMA window is used instead
-
+	struct mpi_peer {
 		// needed by sender to manage which buffers are in use and which are free
 		// just manages indices, that can be used by
 		detail::resource_pool<size_t> local_buffer_pool;
-        detail::resource_pool<size_t> remote_buffer_pool;
+		detail::resource_pool<size_t> remote_buffer_pool;
 
-        request next_request;
-        size_t next_flag = 0;
-        // NOTE: behind these buffers are MSG_BUFFERS many buffers of size MSG_SIZE+FLAG_SIZE, indices are managed by buffer_pool
+		request next_request;
+		size_t next_flag = 0;
+		// NOTE: behind these buffers are MSG_BUFFERS many buffers of size MSG_SIZE+FLAG_SIZE, indices are managed by buffer_pool
 
-        // static window for inbound rma messages and their flags
-        buffer_ptr<msg_flag_buffer> msg_flag_data;
-        MPI_Win msg_flag_win;
+		// static window for inbound rma messages and their flags
+		buffer_ptr<msg_flag_buffer> msg_flag_data;
+		MPI_Win msg_flag_win;
 
 		// mpi rma dynamic window for data
 		MPI_Win rma_data_win;
 	};
 
-
 	mpi_peer* peers;
-    };
+
+};
 
 template<typename T>
 buffer_ptr<T>::buffer_ptr() : buffer_ptr(nullptr, communicator::this_node()) { }
@@ -703,4 +597,4 @@ T& buffer_ptr<T>::operator[](size_t i)
 } // namespace net
 } // namespace ham
 
-#endif // ham_net_communicator_mpi_hpp
+#endif // ham_net_communicator_mpi_rma_dynamic_hpp
diff --git a/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp b/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp
index 4dff738..eb2c762 100644
--- a/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp
+++ b/include/ham/net/communicator_mpi_rma_dynamic_data_only.hpp
@@ -3,8 +3,8 @@
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 
-#ifndef ham_net_communicator_mpi_rma_dynamic_hpp
-#define ham_net_communicator_mpi_rma_dynamic_hpp
+#ifndef ham_net_communicator_mpi_rma_dynamic_data_only_hpp
+#define ham_net_communicator_mpi_rma_dynamic_data_only_hpp
 
 #include <mpi.h>
 
@@ -51,9 +51,6 @@ class buffer_ptr {
 class node_descriptor
 {
 public:
-	//node_descriptor() : name(MPI_MAX_PROCESSOR_NAME, 0) {}
-
-	//const std::string& name() const { return name_; }
 	const char* name() const { return name_; }
 private:
 	//std::string name_; // TODO(improvement): unify node description for all back-ends, NOTE: std::string is not trivally transferable
@@ -74,18 +71,17 @@ class communicator {
 		{}
 
 		// return true if request was finished
-        	// will not work as intended for rma ops, no equivalent to test() available for remote completion
+		// will not work as intended for rma ops, no equivalent to test() available for remote completion
 		bool test()
 		{
 			int flag = 0;
 			MPI_Testall(req_count, mpi_reqs, &flag, MPI_STATUS_IGNORE); // just test the receive request, since the send belonging to the request triggers the remote send that is received
 
-            if(uses_rma_)
-            {
-                HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
-            }
-
-            return flag != 0;
+			if(uses_rma_)
+			{
+				HAM_DEBUG( HAM_LOG << "request::test(), warning: may give false positive on rma remote completion" << std::endl; )
+			}
+	    		return flag != 0;
 		}
 
 		void* get() // blocks
@@ -93,10 +89,10 @@ class communicator {
 			HAM_DEBUG( HAM_LOG << "request::get(), before MPI_Waitall()" << std::endl; )
 			MPI_Waitall(req_count, mpi_reqs, MPI_STATUS_IGNORE); // must wait for all requests to satisfy the standard
 			HAM_DEBUG( HAM_LOG << "request::get(), after MPI_Waitall()" << std::endl; )
-            if(uses_rma_)
-            {
-                MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_win);
-            }
+		if(uses_rma_)
+		{
+			MPI_Win_flush(target_node, communicator::instance().peers[target_node].rma_win);
+		}
 			return static_cast<void*>(&communicator::instance().peers[target_node].msg_buffers[recv_buffer_index]);
 		}
 
@@ -114,10 +110,10 @@ class communicator {
 			return valid_;
 		}
 
-        bool uses_rma() const
-        {
-            return uses_rma_;
-        }
+		bool uses_rma() const
+		{
+		    return uses_rma_;
+		}
 
 		MPI_Request& next_mpi_request()
 		{
@@ -129,7 +125,7 @@ class communicator {
 		node_t target_node;
 		node_t source_node;
 		bool valid_;
-        bool uses_rma_;
+		bool uses_rma_;
 
 		// only needed by the sender
 		enum { NUM_REQUESTS = 3 };
@@ -184,32 +180,32 @@ class communicator {
 		HAM_DEBUG( HAM_LOG << "communicator::communicator(): gathering node descriptions done" << std::endl; )
 
 
-        if (is_host()) {
+		if (is_host()) {
 
-            for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-                // allocate buffers
-                peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
-                // fill resource pools
-                for (size_t j = constants::MSG_BUFFERS; j > 0; --j) {
-                    peers[i].buffer_pool.add(j - 1);
-                }
-            }
-        }
+		    for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+			// allocate buffers
+			peers[i].msg_buffers = allocate_peer_buffer<msg_buffer>(constants::MSG_BUFFERS, this_node_);
+			// fill resource pools
+			for (size_t j = constants::MSG_BUFFERS; j > 0; --j) {
+			    peers[i].buffer_pool.add(j - 1);
+			}
+		    }
+		}
 
-        // initialise 1 global window per target for data
-        for (node_t i = 1; i < nodes_; ++i) {
-            MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_win));
-        }
+		// initialise 1 global window per target for data
+		for (node_t i = 1; i < nodes_; ++i) {
+		    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &(peers[i].rma_win));
+		}
 
-	// get all locks to targets
-        // targets lock to other targets for copies
-        for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
-            if(i != this_node_) {
-                MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks because all ranks lock on every target concurrently
-            }
-        }
+		// get all locks to targets
+		// targets lock to other targets for copies
+		for (node_t i = 1; i < nodes_; ++i) { // TODO(improvement): needs to be changed when host-rank becomes configurable
+		    if(i != this_node_) {
+			MPI_Win_lock(MPI_LOCK_SHARED, i, 0, peers[i].rma_win);  // shared locks because all ranks lock on every target concurrently
+		    }
+		}
 
-        HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
+		HAM_DEBUG( HAM_LOG << "communicator::communicator(): rma window creation done" << std::endl; )
 
 	}
 
@@ -257,7 +253,7 @@ class communicator {
 	{
 		static msg_buffer buffer; // NOTE !
 		MPI_Recv(&buffer, size, MPI_BYTE, host_node_, constants::DEFAULT_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
-        return static_cast<void*>(&buffer);
+		return static_cast<void*>(&buffer);
 	}
 
 	// trigger receiving the result of a message on the sending side
@@ -277,17 +273,16 @@ class communicator {
 	{
 		// execute transfer
 		MPI_Put(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win);
-        	MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
+		MPI_Win_flush(remote_dest.node(), peers[remote_dest.node()].rma_win);
 	}
 
 	// to be used by the host only
 	template<typename T>
 	void send_data_async(request_reference_type req, T* local_source, buffer_ptr<T> remote_dest, size_t size)
 	{
-        req.uses_rma_ = true;
+		req.uses_rma_ = true;
 
-        // MPI_Win_lock(MPI_LOCK_SHARED, remote_dest.node(), 0, peers[remote_dest.node()].rma_win);
-        MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
+		MPI_Rput(local_source, size * sizeof(T), MPI_BYTE, remote_dest.node(), remote_dest.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_dest.node()].rma_win, &req.next_mpi_request());
 	}
 
 	// not used in MPI RMA backend
@@ -305,8 +300,7 @@ class communicator {
 	template<typename T>
 	void recv_data_async(request_reference_type req, buffer_ptr<T> remote_source, T* local_dest, size_t size)
 	{
-        req.uses_rma_ = true;
-
+		req.uses_rma_ = true;
 		MPI_Rget(local_dest, size * sizeof(T), MPI_BYTE, remote_source.node(), remote_source.get_mpi_address(), size * sizeof(T), MPI_BYTE, peers[remote_source.node()].rma_win, &req.next_mpi_request());
 	}
 
@@ -315,9 +309,9 @@ class communicator {
 	{
 		T* ptr;
 		posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
-        // attach to own window
-        MPI_Win_attach(peers[this_node_].rma_win, (void*)ptr, n * sizeof(T));
-        	MPI_Aint mpi_address;
+		// attach to own window
+		MPI_Win_attach(peers[this_node_].rma_win, (void*)ptr, n * sizeof(T));
+		MPI_Aint mpi_address;
 		MPI_Get_address((void*)ptr, &mpi_address);
 		// NOTE: no ctor is called
 		return buffer_ptr<T>(ptr, this_node_, mpi_address);
@@ -338,8 +332,8 @@ class communicator {
 	{
 		assert(ptr.node() == this_node_);
 		// NOTE: no dtor is called
-        	// remove from own rma window
-        	MPI_Win_detach(peers[this_node_].rma_win, ptr.get());
+		// remove from own rma window
+		MPI_Win_detach(peers[this_node_].rma_win, ptr.get());
 		free(static_cast<void*>(ptr.get()));
 	}
 
@@ -397,4 +391,4 @@ T& buffer_ptr<T>::operator[](size_t i)
 } // namespace net
 } // namespace ham
 
-#endif // ham_net_communicator_mpi_hpp
+#endif // ham_net_communicator_mpi_rma_dynamic_data_only_hpp
diff --git a/include/ham/offload/offload.hpp b/include/ham/offload/offload.hpp
index dafb6da..b96c988 100644
--- a/include/ham/offload/offload.hpp
+++ b/include/ham/offload/offload.hpp
@@ -237,7 +237,7 @@ future<void> put(T* local_source, buffer_ptr<T>& remote_dest, size_t n)
 	comm.recv_result(result.get_request()); // trigger receiving the msgs result // async
 	
 	return result;
-#elif HAM_COMM_MPI_RMA_DYNAMIC
+#elif defined(HAM_COMM_MPI_RMA_DYNAMIC) || defined(HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY)
     future<void> result(comm.allocate_data_request(remote_dest.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA put..." << std::endl; )
 	comm.send_data_async(result.get_request(), local_source, remote_dest, n);
@@ -275,7 +275,7 @@ future<void> get(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 	comm.recv_result(result.get_request()); // trigger receiving the result
 	// TODO(improvement): the recv_result() is not needed, could remove and remove send_result() from offload_read_msg to reduce synchronization overhead
 	return result;
-#elif defined HAM_COMM_MPI_RMA_DYNAMIC
+#elif defined(HAM_COMM_MPI_RMA_DYNAMIC) || defined(HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY)
 	future<void> result(comm.allocate_data_request(remote_source.node()));
 	HAM_DEBUG( HAM_LOG << "offload::put(): initiating RMA get..." << std::endl; )
 	comm.recv_data_async(result.get_request(), remote_source, local_dest, n);
@@ -312,7 +312,7 @@ void get_sync(buffer_ptr<T> remote_source, T* local_dest, size_t n)
 
 //}
 
-#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+#if defined(HAM_COMM_MPI_RMA_DYNAMIC) || defined(HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY)
         template<typename T>
 future<void> copy(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 {
@@ -342,7 +342,7 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 // fix 1st arg:
 //	comm.send_data(src_node, local_source, remote_dest, n);
 //	static_assert(false, "copy is not implemented yet for the SCIF back-end");
-#elif defined HAM_COMM_MPI
+#elif defined(HAM_COMM_MPI)
 	// send corresponding write and read messages to the sender and the receiver
 
 	// issues a send operation on the source node, that sends the memory at source to the destination node
@@ -361,10 +361,10 @@ void copy_sync(buffer_ptr<T> source, buffer_ptr<T> dest, size_t n)
 	// TODO(improvement): this is oversynchronized, waiting for the target to complete receiving should be sufficient
 	read_result.get();
 	write_result.get();
-#elif defined HAM_COMM_MPI_RMA_DYNAMIC
-    // use async copy + sync
-    copy(source, dest, n).get();
-#elif defined HAM_COMM_TCP
+#elif defined(HAM_COMM_MPI_RMA_DYNAMIC) || defined(HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY)
+	// use async copy + sync
+	copy(source, dest, n).get();
+#elif defined(HAM_COMM_TCP)
 	T* ptr;
 	posix_memalign((void**)&ptr, constants::CACHE_LINE_SIZE, n * sizeof(T));
 	get_sync(source, ptr, n);
diff --git a/include/ham/offload/offload_msg.hpp b/include/ham/offload/offload_msg.hpp
index 01f4e9d..7e381b6 100644
--- a/include/ham/offload/offload_msg.hpp
+++ b/include/ham/offload/offload_msg.hpp
@@ -6,9 +6,11 @@
 #ifndef ham_offload_offload_msg_hpp
 #define ham_offload_offload_msg_hpp
 
-#ifdef HAM_COMM_MPI_RMA_DYNAMIC
+// the RMA copy message (offload_rma_copy_msg) stores the remote memory address as MPI_Aint, which requires <mpi.h>
+#if defined(HAM_COMM_MPI_RMA_DYNAMIC) || defined(HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY)
 #include <mpi.h>
 #endif
+
 #include "ham/msg/active_msg.hpp"
 #include "ham/msg/execution_policy.hpp"
 #include "ham/misc/constants.hpp"
@@ -43,6 +45,7 @@ struct helper<Functor, void> {
 };
 
 // executes the functor, and send back its result
+// used for all offloads and for remote allocation
 template<class Functor, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_result_msg
 	: public active_msg<offload_result_msg<Functor, ExecutionPolicy>, ExecutionPolicy>
@@ -68,6 +71,7 @@ class offload_result_msg
 };
 
 // just execute the functor
+// fire & forget, not used by current HAM-Offload API
 template<class Functor, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_msg
 	: public active_msg<offload_msg<Functor, ExecutionPolicy>, ExecutionPolicy>
@@ -83,7 +87,8 @@ class offload_msg
 	}
 };
 
-// should not be used by MPI_RMA_COMMUNICATOR since one-sided put is used
+// data transfer message type, triggers RECEIVING data at the target
+// not used by MPI_RMA_COMMUNICATOR since one-sided put is used
 template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_write_msg
 	: public active_msg<offload_write_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -97,7 +102,6 @@ class offload_write_msg
 		communicator::instance().recv_data(buffer_ptr<T>(nullptr, remote_node), local_dest, n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a send operation that has the address.
 
 		// send a result to tell the sender, that the transfer is done
-        // TODO(improvement): this may be
 		if (req.valid()) {
 			req.send_result((void*)&n, sizeof n);
 		}
@@ -111,7 +115,8 @@ class offload_write_msg
 	
 };
 
-// should not be used by MPI_RMA_COMMUNICATOR since one-sided put is used
+// data transfer message type, triggers SENDING data at the target
+// not used by MPI_RMA_COMMUNICATOR since one-sided get is used
 template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
 class offload_read_msg
 	: public active_msg<offload_read_msg<T, ExecutionPolicy>, ExecutionPolicy>
@@ -125,7 +130,7 @@ class offload_read_msg
 		communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node), n); // NOTE: Why nullptr? This is for two-sided communicators, so we do not know the remote address, but match a receive operation that has the address.
 		
 		// send a result message to tell the sender, that the transfer is done
-        // TODO(improvement): this may be removed along with receiving the result in offload get()
+		// TODO(improvement, potential speedup): this may be removed along with receiving the result in offload get(). For host-target transfers, completion of the receive is sufficient; for copies, the destination informs the host of completion.
 		if (req.valid()) {
 			req.send_result((void*)&n, sizeof n);
 		}
@@ -138,62 +143,37 @@ class offload_read_msg
 	size_t n;
 };
 
-#ifdef HAM_COMM_MPI_RMA_DYNAMIC
-    template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
-    class offload_rma_copy_msg
-            : public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
-    {
-    public:
-        offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr, T* local_source, size_t n)
-                : req(req), remote_node(remote_node), remote_addr(remote_addr), local_source(local_source), n(n) { }
-
-        void operator()() //const
-        {
-        /*   communicator::instance().establish_rma_path(remote_node); // should quickly return if path already exists
-            // attach existing buffers to new target window ?!?
-        */
-            communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n);
-
-            // send a result message to tell the sender, that the transfer is done
-            if (req.valid()) {
-                req.send_result((void*)&n, sizeof n);
-            }
-        }
-    private:
-        communicator::request req; // TODO(improvement, high priority): use a subset of req here!
-
-        node_t remote_node;
-        MPI_Aint remote_addr;
-        T* local_source;
-        size_t n;
-    };
-#endif
-
-/*
-// allows user to setup an rma link between two targets without a copy transfer
-#ifdef HAM_COMM_MPI_RMA_DYNAMIC
-    template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
-    class setup_rma_path_msg
-            : public active_msg<setup_rma_path_msg<T, ExecutionPolicy>, ExecutionPolicy>
-    {
-    public:
-        setup_rma_path_msg(node_t remote_node)
-                : remote_node(remote_node) { }
-
-        void operator()() //const
-        {
-            communicator::instance().establish_rma_path(remote_node);
-
-            // send a result message to tell the sender that the path is set up
-            if (req.valid()) {
-                req.send_result((void*)&remote_node, sizeof remote_node);
-            }
-        }
-    private:
-        node_t remote_node;
-    };
+#if defined(HAM_COMM_MPI_RMA_DYNAMIC) || defined(HAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY)
+	// data transfer message type, triggers an RMA data transfer from local_source to the copy target (remote_node)
+	// used only with the MPI RMA communicators
+	// a separate message type is necessary because it carries the target buffer's address (remote_addr)
+	template<typename T, template<class> class ExecutionPolicy = default_execution_policy>
+	class offload_rma_copy_msg
+			: public active_msg<offload_rma_copy_msg<T, ExecutionPolicy>, ExecutionPolicy>
+	{
+	public:
+		offload_rma_copy_msg(communicator::request req, node_t remote_node, MPI_Aint remote_addr, T* local_source, size_t n)
+				: req(req), remote_node(remote_node), remote_addr(remote_addr), local_source(local_source), n(n) { }
+
+		void operator()() //const
+		{
+			// send_data() variant of the MPI RMA communicators: the destination buffer_ptr carries the remote MPI address (remote_addr) and no pointer
+			communicator::instance().send_data(local_source, buffer_ptr<T>(nullptr, remote_node, remote_addr), n);
+
+			// send a result message to tell the sender, that the transfer is done
+			if (req.valid()) {
+				req.send_result((void*)&n, sizeof n);
+			}
+		}
+	private:
+		communicator::request req; // TODO(improvement, high priority): use a subset of req here!
+
+		node_t remote_node;
+		MPI_Aint remote_addr; // MPI_Aint is the reason <mpi.h> is included at the top of this file
+		T* local_source;
+		size_t n;
+	};
 #endif
-*/
 
 } // namespace detail
 } // namespace offload
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index f5dcdd7..98baf8f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -13,8 +13,8 @@ add_subdirectory(ham)
 ## Explicit targets (not built by default)
 
 # TCP benchmarks
-# add_executable(benchmark_ham_offload_tcp benchmark_ham_offload.cpp)
-# target_link_libraries(benchmark_ham_offload_tcp ham_offload_tcp)
+add_executable(benchmark_ham_offload_tcp benchmark_ham_offload.cpp)
+target_link_libraries(benchmark_ham_offload_tcp ham_offload_tcp)
 
 # Intel LEO offload directive benchmark, requires Intel compiler
 if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
@@ -28,6 +28,9 @@ if (MPI_FOUND)
 
 	add_executable (benchmark_ham_offload_mpi_rma_dynamic benchmark_ham_offload.cpp)
 	target_link_libraries (benchmark_ham_offload_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
+
+	add_executable (benchmark_ham_offload_mpi_rma_dynamic_data_only benchmark_ham_offload.cpp)
+	target_link_libraries (benchmark_ham_offload_mpi_rma_dynamic_data_only ham_offload_mpi_rma_dynamic_data_only)
 endif()
 
 if (SCIF_FOUND)
@@ -42,17 +45,17 @@ add_executable(active_msgs active_msgs.cpp)
 target_link_libraries(active_msgs ham_interface)
 
 # TCP tests
-# add_executable(ham_offload_test_tcp ham_offload.cpp)
-# target_link_libraries(ham_offload_test_tcp ham_offload_tcp)
+add_executable(ham_offload_test_tcp ham_offload.cpp)
+target_link_libraries(ham_offload_test_tcp ham_offload_tcp)
 
-# add_executable(inner_product_tcp inner_product.cpp)
-# target_link_libraries(inner_product_tcp ham_offload_tcp)
+add_executable(inner_product_tcp inner_product.cpp)
+target_link_libraries(inner_product_tcp ham_offload_tcp)
 
-# add_executable(test_data_transfer_tcp test_data_transfer.cpp)
-# target_link_libraries(test_data_transfer_tcp ham_offload_tcp)
+add_executable(test_data_transfer_tcp test_data_transfer.cpp)
+target_link_libraries(test_data_transfer_tcp ham_offload_tcp)
 
-# add_executable(test_argument_transfer_tcp test_argument_transfer.cpp)
-# target_link_libraries(test_argument_transfer_tcp ham_offload_tcp)
+add_executable(test_argument_transfer_tcp test_argument_transfer.cpp)
+target_link_libraries(test_argument_transfer_tcp ham_offload_tcp)
 
 if (MPI_FOUND)
 # two-sided MPI
@@ -71,10 +74,11 @@ if (MPI_FOUND)
 	add_executable(test_argument_transfer_mpi test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi ham_offload_mpi)
 
+# Streaming Test MPI
 	add_executable(test_streams_mpi test_streams.cpp)
 	target_link_libraries(test_streams_mpi ham_offload_mpi)
 
-# RMA MPI
+# RMA DYNAMIC MPI (full)
 
 	add_executable(ham_offload_test_mpi_rma_dynamic ham_offload.cpp)
 	target_link_libraries(ham_offload_test_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
@@ -88,6 +92,20 @@ if (MPI_FOUND)
 	add_executable(test_argument_transfer_mpi_rma_dynamic test_argument_transfer.cpp)
 	target_link_libraries(test_argument_transfer_mpi_rma_dynamic ham_offload_mpi_rma_dynamic)
 
+# RMA DYNAMIC MPI (data only)
+	
+	add_executable(ham_offload_test_mpi_rma_dynamic_data_only ham_offload.cpp)
+	target_link_libraries(ham_offload_test_mpi_rma_dynamic_data_only ham_offload_mpi_rma_dynamic_data_only)
+
+	add_executable(inner_product_mpi_rma_dynamic_data_only inner_product.cpp)
+	target_link_libraries(inner_product_mpi_rma_dynamic_data_only ham_offload_mpi_rma_dynamic_data_only)
+
+	add_executable(test_data_transfer_mpi_rma_dynamic_data_only test_data_transfer.cpp)
+	target_link_libraries(test_data_transfer_mpi_rma_dynamic_data_only ham_offload_mpi_rma_dynamic_data_only)
+
+	add_executable(test_argument_transfer_mpi_rma_dynamic_data_only test_argument_transfer.cpp)
+	target_link_libraries(test_argument_transfer_mpi_rma_dynamic_data_only ham_offload_mpi_rma_dynamic_data_only)
+
 endif()
 
 if (SCIF_FOUND)
diff --git a/src/ham/CMakeLists.txt b/src/ham/CMakeLists.txt
index 8108980..4a24c8b 100644
--- a/src/ham/CMakeLists.txt
+++ b/src/ham/CMakeLists.txt
@@ -22,21 +22,21 @@ set(HAM_LIB_SRC
 	util/cpu_affinity.cpp)
 
 # TCP
-#add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
-#		net/communicator.cpp
-#		net/communicator_tcp.cpp
-#		offload/runtime.cpp
-#		offload/offload.cpp
-#		offload/stream.cpp
-#		offload/main.cpp
-#		util/cpu_affinity.cpp)
-#target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
-#target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library pthread)
+add_library(ham_offload_tcp # SHARED if BUILD_SHARED_LIBS = TRUE
+		net/communicator.cpp
+		net/communicator_tcp.cpp
+		offload/runtime.cpp
+		offload/offload.cpp
+		offload/stream.cpp
+		offload/main.cpp
+		util/cpu_affinity.cpp)
+target_compile_definitions(ham_offload_tcp PUBLIC -DHAM_COMM_TCP=1)
+target_link_libraries(ham_offload_tcp PUBLIC ham_interface boost_library pthread)
 
-#set_target_properties(ham_offload_tcp PROPERTIES
-#		CXX_STANDARD 11
-#		CXX_STANDARD_REQUIRED YES
-#		CXX_EXTENSIONS NO)
+set_target_properties(ham_offload_tcp PROPERTIES
+		CXX_STANDARD 11
+		CXX_STANDARD_REQUIRED YES
+		CXX_EXTENSIONS NO)
 
 if (MPI_FOUND)
 	add_library(ham_offload_mpi # SHARED if BUILD_SHARED_LIBS = TRUE
@@ -60,7 +60,14 @@ if (MPI_FOUND)
 	target_compile_definitions(ham_offload_mpi_rma_dynamic PUBLIC -DHAM_COMM_MPI_RMA_DYNAMIC=1)
 	target_link_libraries(ham_offload_mpi_rma_dynamic PUBLIC ham_interface mpi_library)
 
-	set_target_properties(ham_offload_mpi ham_offload_mpi_explicit ham_offload_mpi_rma_dynamic PROPERTIES
+	add_library(ham_offload_mpi_rma_dynamic_data_only # SHARED if BUILD_SHARED_LIBS = TRUE
+	            ${HAM_LIB_SRC}
+	            offload/main.cpp
+	            net/communicator_mpi_rma_dynamic_data_only.cpp)
+	target_compile_definitions(ham_offload_mpi_rma_dynamic_data_only PUBLIC -DHAM_COMM_MPI_RMA_DYNAMIC_DATA_ONLY=1)
+	target_link_libraries(ham_offload_mpi_rma_dynamic_data_only PUBLIC ham_interface mpi_library)
+
+	set_target_properties(ham_offload_mpi ham_offload_mpi_explicit ham_offload_mpi_rma_dynamic ham_offload_mpi_rma_dynamic_data_only PROPERTIES
 		CXX_STANDARD 11
 		CXX_STANDARD_REQUIRED YES
 		CXX_EXTENSIONS NO)