Skip to content

Commit

Permalink
chore: add base split-to-ranges helper (#9095)
Browse files Browse the repository at this point in the history
## Description

This adds a `set_ranges_on_splitted` helper C++ function that makes it
trivial to implement many functions or methods that split strings, such as:

- string.split()
- string.rsplit()
- os.path.basename()
- os.path.dirname()
- os.path.split()
- os.path.splitext()
- os.path.splitdrive()
- os.path.splitroot()

And probably many others in other modules...

## Checklist

- [X] Change(s) are motivated and described in the PR description
- [X] Testing strategy is described if automated tests are not included
in the PR
- [X] Risks are described (performance impact, potential for breakage,
maintainability)
- [X] Change is maintainable (easy to change, telemetry, documentation)
- [X] [Library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
are followed or label `changelog/no-changelog` is set
- [X] Documentation is included (in-code, generated user docs, [public
corp docs](https://github.com/DataDog/documentation/))
- [X] Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))
- [X] If this PR changes the public interface, I've notified
`@DataDog/apm-tees`.

## Reviewer Checklist

- [x] Title is accurate
- [x] All changes are related to the pull request's stated goal
- [x] Description motivates each change
- [x] Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- [x] Testing strategy adequately addresses listed risks
- [x] Change is maintainable (easy to change, telemetry, documentation)
- [x] Release note makes sense to a user of the library
- [x] Author has acknowledged and discussed the performance implications
of this PR as reported in the benchmarks PR comment
- [x] Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Signed-off-by: Juanjo Alvarez <[email protected]>
Co-authored-by: Alberto Vara <[email protected]>
  • Loading branch information
juanjux and avara1986 authored Apr 28, 2024
1 parent 5e6184c commit bf42804
Show file tree
Hide file tree
Showing 5 changed files with 460 additions and 1 deletion.
103 changes: 103 additions & 0 deletions ddtrace/appsec/_iast/_taint_tracking/Aspects/Helpers.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "Helpers.h"
#include "Initializer/Initializer.h"
#include <algorithm>
#include <ostream>
#include <regex>

Expand Down Expand Up @@ -327,6 +328,87 @@ _convert_escaped_text_to_taint_text(const StrType& taint_escaped_text, TaintRang
return { StrType(result), ranges };
}

/**
 * @brief Propagates the taint ranges of a string to the parts produced by splitting it (as in string.split,
 * string.rsplit or os.path.split), rebasing every range so it is relative to the part it lands in.
 *
 * Each non-empty text item of split_result is located in the source string, searching forward from the end of
 * the previously located item. Every source range that overlaps the item is clipped to the item boundaries and
 * stored for the item with a start relative to the beginning of the item.
 *
 * @param source_str: The original string that was split.
 * @param source_ranges: The taint ranges of the original string.
 * @param split_result: The parts obtained by splitting the original string.
 * @param tx_map: The taint map where the ranges of the parts are stored.
 * @param include_separator: Whether the separator is included in the parts. When false, the search for the
 * next part starts one position past the end of the previous part.
 * @return true if at least one part received taint ranges, false otherwise (including when any of the inputs
 * is empty or tx_map is null).
 */
template<class StrType>
bool
set_ranges_on_splitted(const StrType& source_str,
                       const TaintRangeRefs& source_ranges,
                       const py::list& split_result,
                       TaintRangeMapType* tx_map,
                       bool include_separator)
{
    // Some quick shortcuts
    if (source_ranges.empty() or py::len(split_result) == 0 or py::len(source_str) == 0 or not tx_map) {
        return false;
    }

    bool some_set = false;
    const auto c_source_str = py::cast<std::string>(source_str);
    // When the separator is not part of the items, at least one position sits between two consecutive items.
    const RANGE_START separator_increase = include_separator ? 0 : 1;

    // Position in c_source_str from which the next item is searched.
    // NOTE(review): positions are computed on the std::string conversion of the Python objects; for py::str
    // with non-ASCII content these presumably are byte offsets rather than code points -- confirm that this
    // matches the unit used by the taint ranges.
    RANGE_START offset = 0;

    for (const auto& item : split_result) {
        if (not is_text(item.ptr()) or py::len(item) == 0) {
            continue;
        }
        const auto c_item = py::cast<std::string>(item);

        // Find the item in the source string. Check against npos before narrowing to RANGE_START.
        const auto found = c_source_str.find(c_item, static_cast<std::string::size_type>(offset));
        if (found == std::string::npos) {
            continue;
        }
        const auto start = static_cast<RANGE_START>(found);
        const auto end = static_cast<RANGE_START>(found + c_item.length());

        // Intersect every source range with [start, end) and rebase the intersection to the item start.
        TaintRangeRefs item_ranges;
        for (const auto& range : source_ranges) {
            const auto range_start_abs = static_cast<RANGE_START>(range->start);
            const auto range_end_abs = static_cast<RANGE_START>(range->start + range->length);
            if (range_start_abs >= end or range_end_abs <= start) {
                continue;
            }

            const auto overlap_start = std::max(range_start_abs, start);
            const auto overlap_end = std::min(range_end_abs, end);
            item_ranges.emplace_back(initializer->allocate_taint_range(
              overlap_start - start, overlap_end - overlap_start, range->source));
        }
        if (not item_ranges.empty()) {
            set_ranges(item.ptr(), item_ranges, tx_map);
            some_set = true;
        }

        // Continue after the position where the item was really found, so skipped (empty or missing) parts
        // cannot make a repeated part match the same location twice.
        offset = end + separator_increase;
    }

    return some_set;
}

/**
 * @brief Entry point exported to Python for set_ranges_on_splitted: fetches the current tainting map from the
 * initializer and forwards all the arguments to set_ranges_on_splitted.
 *
 * @throws py::value_error (with MSG_ERROR_TAINT_MAP) when the initializer returns no tainting map.
 * @return The value returned by set_ranges_on_splitted.
 */
template<class StrType>
bool
api_set_ranges_on_splitted(const StrType& source_str,
                           const TaintRangeRefs& source_ranges,
                           const py::list& split_result,
                           bool include_separator)
{
    TaintRangeMapType* const current_map = initializer->get_tainting_map();
    if (current_map == nullptr) {
        throw py::value_error(MSG_ERROR_TAINT_MAP);
    }

    const bool any_range_set =
      set_ranges_on_splitted(source_str, source_ranges, split_result, current_map, include_separator);
    return any_range_set;
}

py::object
parse_params(size_t position,
const char* keyword_name,
Expand All @@ -348,6 +430,27 @@ pyexport_aspect_helpers(py::module& m)
m.def("common_replace", &api_common_replace<py::bytes>, "string_method"_a, "candidate_text"_a);
m.def("common_replace", &api_common_replace<py::str>, "string_method"_a, "candidate_text"_a);
m.def("common_replace", &api_common_replace<py::bytearray>, "string_method"_a, "candidate_text"_a);
m.def("set_ranges_on_splitted",
&api_set_ranges_on_splitted<py::bytes>,
"source_str"_a,
"source_ranges"_a,
"split_result"_a,
// cppcheck-suppress assignBoolToPointer
"include_separator"_a = false);
m.def("set_ranges_on_splitted",
&api_set_ranges_on_splitted<py::str>,
"source_str"_a,
"source_ranges"_a,
"split_result"_a,
// cppcheck-suppress assignBoolToPointer
"include_separator"_a = false);
m.def("set_ranges_on_splitted",
&api_set_ranges_on_splitted<py::bytearray>,
"source_str"_a,
"source_ranges"_a,
"split_result"_a,
// cppcheck-suppress assignBoolToPointer
"include_separator"_a = false);
m.def("_all_as_formatted_evidence",
&_all_as_formatted_evidence<py::str>,
"text"_a,
Expand Down
15 changes: 15 additions & 0 deletions ddtrace/appsec/_iast/_taint_tracking/Aspects/Helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,20 @@ template<class StrType>
std::tuple<StrType, TaintRangeRefs>
_convert_escaped_text_to_taint_text(const StrType& taint_escaped_text, TaintRangeRefs ranges_orig);

/**
 * @brief Applies the taint ranges of source_str to the items of split_result (the parts obtained by splitting
 * source_str), storing the resulting ranges of each item in tx_map.
 *
 * Returns false without touching tx_map when source_ranges, split_result or source_str is empty, or when
 * tx_map is null. Items that are not text or that are empty are skipped.
 *
 * @param include_separator: Whether the separator is part of the items in split_result.
 * @return true if ranges were set on at least one item.
 */
template<class StrType>
bool
set_ranges_on_splitted(const StrType& source_str,
const TaintRangeRefs& source_ranges,
const py::list& split_result,
TaintRangeMapType* tx_map,
bool include_separator = false);

/**
 * @brief Variant of set_ranges_on_splitted that obtains the tainting map from the initializer instead of
 * receiving it as a parameter. This is the function registered in the Python module.
 *
 * @throws py::value_error if the initializer has no tainting map.
 * @return true if ranges were set on at least one item of split_result.
 */
template<class StrType>
bool
api_set_ranges_on_splitted(const StrType& source_str,
const TaintRangeRefs& source_ranges,
const py::list& split_result,
bool include_separator = false);

void
pyexport_aspect_helpers(py::module& m);
2 changes: 2 additions & 0 deletions ddtrace/appsec/_iast/_taint_tracking/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from ._native.aspect_helpers import as_formatted_evidence
from ._native.aspect_helpers import common_replace
from ._native.aspect_helpers import parse_params
from ._native.aspect_helpers import set_ranges_on_splitted
from ._native.aspect_ospath_join import _aspect_ospathjoin
from ._native.initializer import active_map_addreses_size
from ._native.initializer import create_context
Expand Down Expand Up @@ -84,6 +85,7 @@
"_format_aspect",
"as_formatted_evidence",
"parse_params",
"set_ranges_on_splitted",
"num_objects_tainted",
"debug_taint_map",
"iast_taint_log_error",
Expand Down
2 changes: 1 addition & 1 deletion scripts/cppcheck.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/bin/bash
set -e

cppcheck --error-exitcode=1 --std=c++17 --language=c++ --force \
cppcheck --inline-suppr --error-exitcode=1 --std=c++17 --language=c++ --force \
$(git ls-files '*.c' '*.cpp' '*.h' '*.hpp' '*.cc' '*.hh' | grep -E -v '^(ddtrace/(vendor|internal)|ddtrace/appsec/_iast/_taint_tracking/_vendor)/')
Loading

0 comments on commit bf42804

Please sign in to comment.