From faa6d785d7298156406345203c4d4e3aa93547d3 Mon Sep 17 00:00:00 2001
From: Wendell Smith <wendell.smith@datadoghq.com>
Date: Fri, 25 Oct 2024 16:04:13 -0400
Subject: [PATCH 1/3] feat: add validate function for a `substrait::Plan`
 message

Also refactors the parse / validation layer a bit to separate them a bit more.
---
 rs/src/lib.rs             |  25 +++++----
 rs/src/parse/mod.rs       | 104 +++++++++++++++++++++++++++++---------
 rs/src/parse/plan.rs      |  39 +++++---------
 rs/src/parse/traversal.rs |  30 ++++++++---
 4 files changed, 133 insertions(+), 65 deletions(-)
diff --git a/rs/src/lib.rs b/rs/src/lib.rs
index 6a86e812..49652ae9 100644
--- a/rs/src/lib.rs
+++ b/rs/src/lib.rs
@@ -9,14 +9,13 @@
 //!  1) Build a [`Config`] structure to configure the validator. You can also
 //!     just use [`std::default::Default`] if you don't need to configure
 //!     anything, but you might want to at least call
-//!     `Config::add_curl_uri_resolver()` (if you're using the `curl`
-//!     feature).
-//!  2) Parse the incoming `substrait.Plan` message using [`parse()`]. This
-//!     creates a [ParseResult], containing a [tree](output::tree) structure
-//!     corresponding to the query plan that also contains diagnostics and
-//!     other annotations added by the validator.
-//!  3) You can traverse the tree yourself using [ParseResult::root], or you
-//!     can use one of the methods associated with [ParseResult] to obtain the
+//!     `Config::add_curl_uri_resolver()` (if you're using the `curl` feature).
+//!  2) Parse the incoming `substrait.Plan` message using [`parse()`] or
+//!     [`validate()`]. This creates a [ParseResult], containing a
+//!     [tree](output::tree) structure corresponding to the query plan that also
+//!     contains diagnostics and other annotations added by the validator.
+//!  3) You can traverse the tree yourself using [ParseResult::root], or you can
+//!     use one of the methods associated with [ParseResult] to obtain the
 //!     validation results you need.
 //!
 //! Note that only the binary protobuf serialization format is supported at the
@@ -171,6 +170,7 @@ mod util;
 
 use std::str::FromStr;
 
+use input::proto::substrait::Plan;
 use strum::IntoEnumIterator;
 
 // Aliases for common types used on the crate interface.
@@ -183,11 +183,18 @@ pub use output::diagnostic::Level;
 pub use output::parse_result::ParseResult;
 pub use output::parse_result::Validity;
 
-/// Validates the given substrait.Plan message and returns the parse tree.
+/// Parses and validates the given substrait [Plan] message and returns the
+/// parse tree and diagnostic results.
 pub fn parse<B: prost::bytes::Buf + Clone>(buffer: B, config: &Config) -> ParseResult {
     parse::parse(buffer, config)
 }
 
+/// Validates the given substrait [Plan] message and returns the parse tree  and
+/// diagnostic results.
+pub fn validate(plan: &Plan, config: &Config) -> ParseResult {
+    parse::validate(plan, config)
+}
+
 /// Returns an iterator that yields all known diagnostic classes.
 pub fn iter_diagnostics() -> impl Iterator<Item = Classification> {
     Classification::iter()
diff --git a/rs/src/parse/mod.rs b/rs/src/parse/mod.rs
index 03b57aaa..05d87be8 100644
--- a/rs/src/parse/mod.rs
+++ b/rs/src/parse/mod.rs
@@ -197,44 +197,86 @@ use crate::output::diagnostic;
 use crate::output::parse_result;
 use crate::output::path;
 
-/// Validates the given substrait.Plan message and returns the parse tree.
+use prost::Message;
+
+/// Represents the state of attempting to decode a buffer as a plan, or failing
+/// that, as a plan version.
+enum ParsedProtoResult {
+    /// Successful parsing
+    Parsed(proto::substrait::Plan),
+
+    /// Failed to decode into `substrait::Plan`, possibly due to a version
+    /// conflict; see `Cause`. Succeeded in decoding the version.
+    VersionParsed(proto::substrait::PlanVersion, diagnostic::Cause),
+
+    /// Failed to decode into `substrait::Plan` (first cause) and
+    /// `substrait::PlanVersion` (second cause).
+    Failed(diagnostic::Cause, diagnostic::Cause),
+}
+
+/// Parse the given buffer as a Plan if possible, PlanVersion if not, returning
+/// errors as they are generated.
+fn parse_proto<B: prost::bytes::Buf + Clone>(buffer: B) -> ParsedProtoResult {
+    // Attempt to parse the buffer as a Plan.
+    let err1 = match proto::substrait::Plan::decode(buffer.clone()) {
+        Ok(plan) => return ParsedProtoResult::Parsed(plan),
+        Err(e) => ecause!(ProtoParseFailed, e),
+    };
+
+    // Attempt to parse the buffer as a PlanVersion, as fallback.
+    match proto::substrait::PlanVersion::decode(buffer) {
+        Ok(version) => ParsedProtoResult::VersionParsed(version, err1),
+        Err(e2) => ParsedProtoResult::Failed(err1, ecause!(ProtoParseFailed, e2)),
+    }
+}
+
+/// Parses the given [`proto::substrait::Plan`] message, validates it, and
+/// returns the parse tree with diagnostic results.
 pub fn parse<B: prost::bytes::Buf + Clone>(
     buffer: B,
     config: &config::Config,
 ) -> parse_result::ParseResult {
     let mut state = context::State::default();
 
-    // Parse the normal way.
-    let err1 = match traversal::parse_proto::<proto::substrait::Plan, _, _>(
-        buffer.clone(),
-        "plan",
-        plan::parse_plan,
-        &mut state,
-        config,
-    ) {
-        Ok(parse_result) => return parse_result,
-        Err(err) => err,
-    };
+    let (err1, err2) = match parse_proto(buffer) {
+        ParsedProtoResult::Parsed(ref plan) => {
+            return traversal::validate::<proto::substrait::Plan, _>(
+                plan,
+                "plan",
+                plan::parse_plan,
+                &mut state,
+                config,
+            );
+        }
+        ParsedProtoResult::VersionParsed(ref version, err1) => {
+            return traversal::validate::<proto::substrait::PlanVersion, _>(
+                version,
+                "plan",
+                |tree, ctx| {
+                    // We have a PlanVersion, but the Plan itself failed to
+                    // decode - so include that error.
+                    diagnostic!(ctx, Error, err1);
 
-    // Parse the fallback PlanVersion message that only includes the version
-    // information.
-    let err2 = match traversal::parse_proto::<proto::substrait::PlanVersion, _, _>(
-        buffer,
-        "plan",
-        |x, y| plan::parse_plan_version(x, y, err1.clone()),
-        &mut state,
-        config,
-    ) {
-        Ok(parse_result) => return parse_result,
-        Err(err) => err,
+                    plan::parse_plan_version(tree, ctx)
+                },
+                &mut state,
+                config,
+            );
+        }
+        ParsedProtoResult::Failed(err1, err2) => (err1, err2),
     };
 
+    // --------------------------------------------------------------------------------
+    // The parser failed to decode the buffer as either a plan or plan version, so now
+    // we create a minimal parse tree with the error diagnostics from the parser.
+
     // Create a minimal root node with just the decode error
     // diagnostic.
     let mut root = proto::substrait::Plan::type_to_node();
 
     // Create a root context for it.
     let mut context = context::Context::new("plan", &mut root, &mut state, config);
+    plan::mark_experimental(&mut context);
 
     // Push the earlier diagnostic.
     context.push_diagnostic(diagnostic::RawDiagnostic {
@@ -264,3 +306,19 @@ pub fn parse<B: prost::bytes::Buf + Clone>(
 
     parse_result::ParseResult { root }
 }
+
+/// Validate the given [`proto::substrait::Plan`] message, returning the parse
+/// tree with diagnostic results.
+pub fn validate(
+    plan: &proto::substrait::Plan,
+    config: &config::Config,
+) -> parse_result::ParseResult {
+    let mut state = context::State::default();
+    traversal::validate::<proto::substrait::Plan, _>(
+        plan,
+        "plan",
+        plan::parse_plan,
+        &mut state,
+        config,
+    )
+}
diff --git a/rs/src/parse/plan.rs b/rs/src/parse/plan.rs
index 5772a0e4..74d74b8a 100644
--- a/rs/src/parse/plan.rs
+++ b/rs/src/parse/plan.rs
@@ -157,9 +157,9 @@ fn parse_version(x: &substrait::Version, y: &mut context::Context) -> diagnostic
 }
 
 /// Report the "validator is experimental" diagnostic.
-fn mark_experimental(y: &mut context::Context) {
+pub fn mark_experimental(ctx: &mut context::Context) {
     diagnostic!(
-        y,
+        ctx,
         Info,
         Experimental,
         "this version of the validator is EXPERIMENTAL. Please report issues \
@@ -168,23 +168,23 @@ fn mark_experimental(y: &mut context::Context) {
 }
 
 /// Toplevel parse function for a plan.
-pub fn parse_plan(x: &substrait::Plan, y: &mut context::Context) -> diagnostic::Result<()> {
-    mark_experimental(y);
+pub fn parse_plan(plan: &substrait::Plan, ctx: &mut context::Context) {
+    mark_experimental(ctx);
 
     // Parse the version.
-    proto_required_field!(x, y, version, parse_version);
+    proto_required_field!(plan, ctx, version, parse_version);
 
     // Handle extensions first, because we'll need their declarations to
     // correctly interpret the relations.
-    extensions::parse_plan(x, y);
+    extensions::parse_plan(plan, ctx);
 
     // Handle the relations.
-    let num_relations = proto_repeated_field!(x, y, relations, parse_plan_rel)
+    let num_relations = proto_repeated_field!(plan, ctx, relations, parse_plan_rel)
         .0
         .len();
     if num_relations == 0 {
         diagnostic!(
-            y,
+            ctx,
             Error,
             RelationRootMissing,
             "a plan must have at least one relation"
@@ -193,25 +193,14 @@ pub fn parse_plan(x: &substrait::Plan, y: &mut context::Context) -> diagnostic::
 
     // Generate an Info diagnostic for every extension definition that wasn't
     // used at any point, and can thus be safely removed.
-    extensions::check_unused_definitions(y);
-
-    Ok(())
+    extensions::check_unused_definitions(ctx);
 }
 
-/// Toplevel parse function for a plan.
-pub fn parse_plan_version(
-    x: &substrait::PlanVersion,
-    y: &mut context::Context,
-    e: diagnostic::Cause,
-) -> diagnostic::Result<()> {
-    mark_experimental(y);
-
-    // Push the diagnostic that the caller got while parsing as a complete Plan
-    // before.
-    diagnostic!(y, Error, e);
+/// Toplevel validation function for a plan. Validates that the PlanVersion
+/// matches expected format, pushing errors to the `Context`.
+pub fn parse_plan_version(tree: &substrait::PlanVersion, ctx: &mut context::Context) {
+    mark_experimental(ctx);
 
     // Parse the version.
-    proto_required_field!(x, y, version, parse_version);
-
-    Ok(())
+    proto_required_field!(tree, ctx, version, parse_version);
 }
diff --git a/rs/src/parse/traversal.rs b/rs/src/parse/traversal.rs
index d723a833..f8367434 100644
--- a/rs/src/parse/traversal.rs
+++ b/rs/src/parse/traversal.rs
@@ -587,12 +587,30 @@ pub fn parse_proto<T, F, B>(
 ) -> diagnostic::Result<parse_result::ParseResult>
 where
     T: prost::Message + InputNode + Default,
-    F: FnOnce(&T, &mut context::Context) -> diagnostic::Result<()>,
+    F: FnOnce(&T, &mut context::Context),
     B: prost::bytes::Buf,
 {
     // Run protobuf deserialization.
     let input = T::decode(buffer).map_err(|e| ecause!(ProtoParseFailed, e))?;
 
+    Ok(validate(&input, root_name, root_parser, state, config))
+}
+
+/// Validates a serialized protobuf message using the given `root_parser`
+/// function, initial state, and configuration, pushing any errors from the
+/// root_validator or unhandled children into the returned
+/// [`parse_result::ParseResult`].
+pub fn validate<T, F>(
+    input: &T,
+    root_name: &'static str,
+    root_validator: F,
+    state: &mut context::State,
+    config: &config::Config,
+) -> parse_result::ParseResult
+where
+    T: prost::Message + InputNode + Default,
+    F: FnOnce(&T, &mut context::Context),
+{
     // Create the root node.
     let mut root = input.data_to_node();
 
@@ -600,18 +618,14 @@ where
     let mut context = context::Context::new(root_name, &mut root, state, config);
 
     // Call the provided parser function.
-    let success = root_parser(&input, &mut context)
-        .map_err(|cause| {
-            diagnostic!(&mut context, Error, cause);
-        })
-        .is_ok();
+    root_validator(input, &mut context);
 
     // Handle any fields not handled by the provided parse function.
     // Only generate a warning diagnostic for unhandled children if the
     // parse function succeeded.
-    handle_unknown_children(&input, &mut context, success);
+    handle_unknown_children(input, &mut context, true);
 
-    Ok(parse_result::ParseResult { root })
+    parse_result::ParseResult { root }
 }
 
 //=============================================================================

From 652a4dcc957e760d635ef9a6fb3c5e38038ed399 Mon Sep 17 00:00:00 2001
From: Wendell Smith <wendell.smith@datadoghq.com>
Date: Mon, 28 Oct 2024 09:59:44 -0400
Subject: [PATCH 2/3] ci: add linker options for OSX

---
 c/CMakeLists.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/c/CMakeLists.txt b/c/CMakeLists.txt
index 16b5a40f..f375d01f 100644
--- a/c/CMakeLists.txt
+++ b/c/CMakeLists.txt
@@ -48,12 +48,23 @@ if(SUBSTRAIT_VALIDATOR_BUILD_TESTS)
         substrait-validator-c-test
         ${CMAKE_CURRENT_SOURCE_DIR}/tests/test.cc
     )
+
+    # For OSX, link in CoreFoundation, needed for some system symbols like
+    # _CFRelease, _CFStringGetBytes, …
+    set(EXTRA_FRAMEWORK_OPTIONS "")
+    if(APPLE)
+        set(EXTRA_FRAMEWORK_OPTIONS "-framework CoreFoundation")
+    endif()
+
+
     target_link_libraries(
         substrait-validator-c-test
         gtest_main
         substrait_validator_c
+        "${EXTRA_FRAMEWORK_OPTIONS}"
     )
 
+
     include(GoogleTest)
     gtest_discover_tests(substrait-validator-c-test)
 

From 4b9ec9717874d8357b3a726f84a0a987169dbab6 Mon Sep 17 00:00:00 2001
From: Wendell Smith <wendell.smith@datadoghq.com>
Date: Mon, 28 Oct 2024 10:00:37 -0400
Subject: [PATCH 3/3] ci: update c test to also include the Experimental note

---
 c/tests/test.cc | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/c/tests/test.cc b/c/tests/test.cc
index b9131e48..34aa701b 100644
--- a/c/tests/test.cc
+++ b/c/tests/test.cc
@@ -33,12 +33,17 @@ TEST(BasicTest, BasicTest) {
   EXPECT_EQ(strlen(reinterpret_cast<const char *>(data_ptr)), data_size);
   EXPECT_EQ(
       reinterpret_cast<const char *>(data_ptr),
-      std::string("Error at plan: failed to parse as substrait.Plan: "
-                  "failed to decode Protobuf message: "
-                  "invalid wire type value: 7 (code 1001) (code 1001)\n"
-                  "Error at plan: failed to parse as substrait.PlanVersion: "
-                  "failed to decode Protobuf message: "
-                  "invalid wire type value: 7 (code 1001) (code 1001)\n"));
+      std::string(
+          "Info at plan: this version of the validator is EXPERIMENTAL. "
+          "Please report issues via "
+          "https://github.com/substrait-io/substrait-validator/issues/new "
+          "(code 0999)\n"
+          "Error at plan: failed to parse as substrait.Plan: "
+          "failed to decode Protobuf message: "
+          "invalid wire type value: 7 (code 1001) (code 1001)\n"
+          "Error at plan: failed to parse as substrait.PlanVersion: "
+          "failed to decode Protobuf message: "
+          "invalid wire type value: 7 (code 1001) (code 1001)\n"));
 
   // Free the buffer.
   substrait_validator_free_exported(data_ptr);