Skip to content

Commit

Permalink
chore(iast): more native utils and split aspects moved to fully native (
Browse files Browse the repository at this point in the history
#10456)

## Description

- Add more native utility functions like `process_flag_added_args` that
will make migrating the remaining aspects a lot easier.
- Migrate the `split`, `rsplit` and `splitlines` aspects to fully native
code, for an average 62% performance increase.

## Checklist
- [X] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Signed-off-by: Juanjo Alvarez <[email protected]>
  • Loading branch information
juanjux authored Sep 2, 2024
1 parent 666f6ec commit e36b8ea
Show file tree
Hide file tree
Showing 8 changed files with 223 additions and 175 deletions.
225 changes: 138 additions & 87 deletions ddtrace/appsec/_iast/_taint_tracking/Aspects/AspectSplit.cpp
Original file line number Diff line number Diff line change
@@ -1,114 +1,165 @@
#include "AspectSplit.h"
#include "Initializer/Initializer.h"
#include "TaintedOps/TaintedOps.h"

template<class StrType>
py::list
api_split_text(const StrType& text, const optional<StrType>& separator, const optional<int> maxsplit)
static std::optional<py::object>
handle_potential_re_split(const py::tuple& args,
const py::tuple& sliced_args,
const py::kwargs& kwargs,
const TaintRangeMapTypePtr& tx_map)
{
const auto split = text.attr("split");
const auto split_result = split(separator, maxsplit);
const py::module re = py::module::import("re");
const py::object re_pattern_type = re.attr("Pattern");
const py::object types_module = py::module::import("types");

const auto tx_map = Initializer::get_tainting_map();
if (not tx_map or tx_map->empty()) {
return split_result;
}
// re.split aspect, either with pattern as first arg or with re module
if (const py::object module_type = types_module.attr("ModuleType");
isinstance(args[0], re_pattern_type) ||
(isinstance(args[0], module_type) && std::string(py::str(args[0].attr("__name__"))) == "re" &&
(std::string(py::str(args[0].attr("__package__"))).empty() ||
std::string(py::str(args[0].attr("__package__"))) == "re"))) {

if (auto ranges = api_get_ranges(text); not ranges.empty()) {
set_ranges_on_splitted(text, ranges, split_result, tx_map, false);
const py::object split_func = args[0].attr("split");
// Create a py::slice object to slice the args from index 1 to the end
py::list result = split_func(*sliced_args, **kwargs);

if (const int offset = isinstance(args[0], re_pattern_type) ? -1 : 0;
args.size() >= (static_cast<size_t>(3) + offset) && is_tainted(args[2 + offset].ptr(), tx_map)) {
for (auto& i : result) {
if (!i.is_none() and len(i) > 0) {
copy_and_shift_ranges_from_strings(args[2 + offset], i, 0, len(i), tx_map);
}
}
}
return result;
}

return split_result;
return std::nullopt;
}

template<class StrType>
py::list
api_rsplit_text(const StrType& text, const optional<StrType>& separator, const optional<int> maxsplit)
static py::object
split_text_common(const py::object& orig_function,
const int flag_added_args,
const py::args& args,
const py::kwargs& kwargs,
const std::string& split_func)
{
const auto rsplit = text.attr("rsplit");
const auto split_result = rsplit(separator, maxsplit);
const auto tx_map = Initializer::get_tainting_map();
if (not tx_map or tx_map->empty()) {
return split_result;
PyObject* result_or_args = process_flag_added_args(orig_function.ptr(), flag_added_args, args.ptr(), kwargs.ptr());
py::tuple args_tuple;
if (PyTuple_Check(result_or_args)) {
args_tuple = py::reinterpret_borrow<py::tuple>(result_or_args);
} else {
return py::reinterpret_borrow<py::list>(result_or_args);
}

if (auto ranges = api_get_ranges(text); not ranges.empty()) {
set_ranges_on_splitted(text, ranges, split_result, tx_map, false);
const auto& text = args_tuple[0];

const py::tuple sliced_args = len(args) > 1 ? args[py::slice(1, len(args), 1)] : py::tuple(); // (,)
auto result_o = text.attr(split_func.c_str())(*sliced_args, **kwargs); // returns['', ''] WHY?

const auto tx_map = Initializer::get_tainting_map();
if (!tx_map || tx_map->empty()) {
return result_o;
}
return split_result;

TRY_CATCH_ASPECT("split_aspect", , {
if (split_func == "split") {
if (auto re_split_result = handle_potential_re_split(args_tuple, sliced_args, kwargs, tx_map);
re_split_result.has_value()) {
return *re_split_result;
}
}

auto [ranges, ranges_error] = get_ranges(text.ptr(), tx_map);
if (!ranges_error and !ranges.empty()) {
set_ranges_on_splitted(text, ranges, result_o, tx_map, false);
}
});
return result_o;
}

template<class StrType>
py::list
api_splitlines_text(const StrType& text, bool keepends)
py::object
api_splitlines_text(const py::object& orig_function,
const int flag_added_args,
const py::args& args,
const py::kwargs& kwargs)
{
const auto splitlines = text.attr("splitlines");
const auto split_result = splitlines(keepends);
const auto tx_map = Initializer::get_tainting_map();
if (not tx_map or tx_map->empty()) {
return split_result;
const auto result_or_args = py::reinterpret_borrow<py::object>(
process_flag_added_args(orig_function.ptr(), flag_added_args, args.ptr(), kwargs.ptr()));

py::tuple args_tuple;
if (py::isinstance<py::tuple>(result_or_args)) {
args_tuple = result_or_args.cast<py::tuple>();
} else {
return result_or_args.cast<py::list>();
}

if (auto ranges = api_get_ranges(text); not ranges.empty()) {
set_ranges_on_splitted(text, ranges, split_result, tx_map, keepends);
const auto& text = args_tuple[0];
const py::tuple sliced_args = len(args) > 1 ? args[py::slice(1, len(args), 1)] : py::tuple();
py::object result_o = text.attr("splitlines")(*sliced_args, **kwargs);

const auto tx_map = Initializer::get_tainting_map();
if (!tx_map || tx_map->empty()) {
return result_o;
}
return split_result;

TRY_CATCH_ASPECT("split_aspect", , {
auto [ranges, ranges_error] = get_ranges(text.ptr(), tx_map);
if (ranges_error || ranges.empty()) {
return result_o;
}

// Retrieve keepends and check that is a boolean. If not, return the original value
// because it could be a method of a different type.
bool keepends_is_other_type = false;
bool keepends = false;
if (kwargs.contains("keepends")) {
if (py::isinstance<py::bool_>(kwargs["keepends"])) {
keepends = kwargs["keepends"].cast<bool>();
} else {
keepends_is_other_type = true;
}
} else {
if (args.size() > 1) {
if (py::isinstance<py::bool_>(args[1])) {
keepends = args[1].cast<bool>();
} else {
keepends_is_other_type = true;
}
}
}

if (!keepends_is_other_type) {
set_ranges_on_splitted(text, ranges, result_o, tx_map, keepends);
}
});
return result_o;
}

void
pyexport_aspect_split(py::module& m)
{
m.def("_aspect_split",
&api_split_text<py::str>,
"text"_a,
"separator"_a = py::none(),
"maxsplit"_a = -1,
py::return_value_policy::move);
m.def("_aspect_split",
&api_split_text<py::bytes>,
"text"_a,
"separator"_a = py::none(),
"maxsplit"_a = -1,
py::return_value_policy::move);
m.def("_aspect_split",
&api_split_text<py::bytearray>,
"text"_a,
"separator"_a = py::none(),
"maxsplit"_a = -1,
py::return_value_policy::move);
m.def("_aspect_rsplit",
&api_rsplit_text<py::str>,
"text"_a,
"separator"_a = py::none(),
"maxsplit"_a = -1,
py::return_value_policy::move);
m.def("_aspect_rsplit",
&api_rsplit_text<py::bytes>,
"text"_a,
"separator"_a = py::none(),
"maxsplit"_a = -1,
py::return_value_policy::move);
m.def("_aspect_rsplit",
&api_rsplit_text<py::bytearray>,
"text"_a,
"separator"_a = py::none(),
"maxsplit"_a = -1,
py::return_value_policy::move);
// cppcheck-suppress assignBoolToPointer
m.def("_aspect_splitlines",
&api_splitlines_text<py::str>,
"text"_a,
"keepends"_a = false,
py::return_value_policy::move);
// cppcheck-suppress assignBoolToPointer
m.def("_aspect_splitlines",
&api_splitlines_text<py::bytes>,
"text"_a,
"keepends"_a = false,
py::return_value_policy::move);
// cppcheck-suppress assignBoolToPointer
m.def(
"_aspect_split",
[](const py::object& orig_function, const int flag_added_args, const py::args& args, const py::kwargs& kwargs) {
return split_text_common(orig_function, flag_added_args, args, kwargs, "split");
},
"orig_function"_a = py::none(),
"flag_added_args"_a = 0,
py::return_value_policy::move);

m.def(
"_aspect_rsplit",
[](const py::object& orig_function, const int flag_added_args, const py::args& args, const py::kwargs& kwargs) {
return split_text_common(orig_function, flag_added_args, args, kwargs, "rsplit");
},
"orig_function"_a = py::none(),
"flag_added_args"_a = 0,
py::return_value_policy::move);

m.def("_aspect_splitlines",
&api_splitlines_text<py::bytearray>,
"text"_a,
"keepends"_a = false,
&api_splitlines_text,
"orig_function"_a = py::none(),
"flag_added_args"_a = 0,
py::return_value_policy::move);
}
}
16 changes: 5 additions & 11 deletions ddtrace/appsec/_iast/_taint_tracking/Aspects/AspectSplit.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,11 @@

#include "Helpers.h"

template<class StrType>
py::list
api_split_text(const StrType& text, const optional<StrType>& separator, optional<int> maxsplit);

template<class StrType>
py::list
api_rsplit_text(const StrType& text, const optional<StrType>& separator, optional<int> maxsplit);

template<class StrType>
py::list
api_splitlines_text(const StrType& text, bool keepends);
py::object
api_splitlines_text(const py::object& orig_function,
int flag_added_args,
const py::args& args,
const py::kwargs& kwargs);

void
pyexport_aspect_split(py::module& m);
3 changes: 1 addition & 2 deletions ddtrace/appsec/_iast/_taint_tracking/Aspects/Helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,9 +228,8 @@ convert_escaped_text_to_taint_text(const StrType& taint_escaped_text, TaintRange
* @param tx_map: The taint map to apply the ranges.
* @param include_separator: If the separator should be included in the splitted parts.
*/
template<class StrType>
bool
set_ranges_on_splitted(const StrType& source_str,
set_ranges_on_splitted(const py::object& source_str,
const TaintRangeRefs& source_ranges,
const py::list& split_result,
const TaintRangeMapTypePtr& tx_map,
Expand Down
33 changes: 31 additions & 2 deletions ddtrace/appsec/_iast/_taint_tracking/Aspects/Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,8 @@ template<class StrType>
std::tuple<StrType, TaintRangeRefs>
convert_escaped_text_to_taint_text(const StrType& taint_escaped_text, TaintRangeRefs ranges_orig);

template<class StrType>
bool
set_ranges_on_splitted(const StrType& source_str,
set_ranges_on_splitted(const py::object& source_str,
const TaintRangeRefs& source_ranges,
const py::list& split_result,
const TaintRangeMapTypePtr& tx_map,
Expand Down Expand Up @@ -176,6 +175,36 @@ as_formatted_evidence(StrType& text,
return StrType(EVIDENCE_MARKS::BLANK).attr("join")(res_vector);
}

/**
 * Decide whether an aspect call is forwarded to the original callable or handled by the caller.
 *
 * If `orig_function` is not None and its exact type is not str, bytes or bytearray, it is called
 * with `args` minus its first `flag_added_args` positional entries (plus `kwargs`) and the result
 * of that call is returned. Otherwise `args` itself is returned so the caller can process it.
 *
 * @param orig_function   The callable (or None / str / bytes / bytearray instance) received by the aspect.
 * @param flag_added_args Number of leading positional arguments to drop before forwarding the call.
 *                        Values <= 0 forward `args` unchanged; values larger than the number of
 *                        positional arguments are clamped, so the call is forwarded with no
 *                        positional arguments instead of building a negative-sized tuple.
 * @param args            Positional arguments tuple.
 * @param kwargs          Keyword arguments, passed through to PyObject_Call as-is.
 * @return A NEW (owned) reference in every successful case: either the result of the forwarded
 *         call or `args` with its reference count incremented. Returns nullptr with a Python
 *         exception set if the forwarded call raises, if `args` is not a tuple, or if the sliced
 *         tuple cannot be allocated. Callers must check for nullptr and take ownership of the
 *         reference (rather than borrowing it) to avoid leaking it.
 */
inline PyObject*
process_flag_added_args(PyObject* orig_function, const int flag_added_args, PyObject* args, PyObject* kwargs)
{
    const auto orig_function_type = Py_TYPE(orig_function);
    const bool handled_by_caller = orig_function == Py_None || orig_function_type == &PyUnicode_Type ||
                                   orig_function_type == &PyByteArray_Type || orig_function_type == &PyBytes_Type;

    if (handled_by_caller) {
        // None or a built-in string-like receiver: hand the args back for further processing.
        Py_INCREF(args); // The caller receives an owned reference.
        return args;
    }

    if (flag_added_args <= 0) {
        // No slicing needed: call the original function with all args.
        return PyObject_Call(orig_function, args, kwargs);
    }

    const Py_ssize_t num_args = PyTuple_Size(args);
    if (num_args < 0) {
        // args is not a tuple; PyTuple_Size has already set the Python exception.
        return nullptr;
    }

    // Clamp so that a flag count larger than the tuple never yields a negative tuple size.
    const Py_ssize_t skipped = static_cast<Py_ssize_t>(flag_added_args) < num_args
                                 ? static_cast<Py_ssize_t>(flag_added_args)
                                 : num_args;
    const Py_ssize_t kept = num_args - skipped;

    PyObject* sliced_args = PyTuple_New(kept);
    if (sliced_args == nullptr) {
        return nullptr; // Allocation failed; exception already set.
    }
    for (Py_ssize_t i = 0; i < kept; ++i) {
        PyObject* item = PyTuple_GetItem(args, i + skipped); // Borrowed reference.
        Py_INCREF(item);                                     // PyTuple_SET_ITEM steals a reference.
        PyTuple_SET_ITEM(sliced_args, i, item);
    }

    // Call the original function with the sliced args and return its result (nullptr on error).
    PyObject* result = PyObject_Call(orig_function, sliced_args, kwargs);
    Py_DECREF(sliced_args);
    return result;
}

void
pyexport_aspect_helpers(py::module& m);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,9 @@ std::pair<TaintRangeRefs, bool>
get_ranges(PyObject* string_input, const TaintRangeMapTypePtr& tx_map)
{
TaintRangeRefs result;
if (not is_tainteable(string_input))
if (not is_tainteable(string_input)) {
return std::make_pair(result, true);
}

if (tx_map->empty()) {
return std::make_pair(result, false);
Expand Down
22 changes: 22 additions & 0 deletions ddtrace/appsec/_iast/_taint_tracking/TaintTracking/TaintRange.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#pragma once
#include <sstream>
#include <utility>

#include <pybind11/stl.h>
Expand Down Expand Up @@ -145,5 +146,26 @@ get_internal_hash(PyObject* obj);
void
set_tainted_object(PyObject* str, TaintedObjectPtr tainted_object, const TaintRangeMapTypePtr& tx_map);

/**
 * Copy the taint ranges of `str_1` onto `str_2`, shifted by `offset` and bounded by `new_length`.
 *
 * Does nothing when `tx_map` is null. If the ranges of `str_1` cannot be retrieved, or the shifted
 * ranges cannot be set on `str_2`, a Python TypeError is set (MSG_ERROR_TAINT_MAP or
 * MSG_ERROR_SET_RANGES respectively) and the function returns normally without throwing.
 *
 * @param str_1      Object whose taint ranges are read.
 * @param str_2      Object that receives the shifted ranges.
 * @param offset     Shift forwarded to shift_taint_ranges.
 * @param new_length Length forwarded to shift_taint_ranges.
 * @param tx_map     Taint range map used for both the lookup and the update.
 */
inline void
copy_and_shift_ranges_from_strings(const py::handle& str_1,
                                   const py::handle& str_2,
                                   const int offset,
                                   const int new_length,
                                   const TaintRangeMapTypePtr& tx_map)
{
    if (tx_map) {
        auto [source_ranges, lookup_failed] = get_ranges(str_1.ptr(), tx_map);
        if (lookup_failed) {
            // The source ranges could not be obtained: report it and leave str_2 untouched.
            py::set_error(PyExc_TypeError, MSG_ERROR_TAINT_MAP);
        } else if (!set_ranges(str_2.ptr(), shift_taint_ranges(source_ranges, offset, new_length), tx_map)) {
            // The shifted ranges could not be stored on the destination.
            py::set_error(PyExc_TypeError, MSG_ERROR_SET_RANGES);
        }
    }
}

void
pyexport_taintrange(py::module& m);
Loading

0 comments on commit e36b8ea

Please sign in to comment.