From 0c21143e9558b4df9b0980dc9c927f79ef8af8d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ervin=20Heged=C3=BCs?= Date: Wed, 2 Sep 2020 09:42:32 +0000 Subject: [PATCH] README fix; subject read from STDIN --- CHANGES | 7 ++++++- README.md | 29 ++++++++++++++++++++--------- src/msc2.c | 41 +++++++++++++++++++++++++++++------------ src/msc3.cc | 34 +++++++++++++++++++++++++--------- 4 files changed, 80 insertions(+), 31 deletions(-) diff --git a/CHANGES b/CHANGES index f5c94db..f9117c0 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,10 @@ -v0.3 - YYYY-MM-DD +v0.4 - YYYY-MM-dd ----------------- + +v0.3 - 2020-09-02 +----------------- + * new feature: read subject from stdin - thanks @dune73 + * README fix: mark 'Other notes' as outdated * refactoring regex.cc for pcre4msc3 * add `-f` argument for pcre4msc3 * aligned v3 behavior to the fix (#2348) diff --git a/README.md b/README.md index 0f3cdf6..fef7327 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,22 @@ -# msc_pcretest +# msc_retest -Welcome to the `msc_pcretest` documentation. +Welcome to the `msc_retest` documentation. Description =========== -This tool compiles two binaries: `pcre4msc2` and `pcre4msc3`. The binaries emulates the behaviors of regex engine (PCRE - the old version) in mod_security2 (Apache module) and the libmodsecurity3. With this programs, you can check the evaulation time and result of every regular expressions with any random (including very extreme long) input. Both of them (regex pattern, input subject) needs to exists in two separated files, and you can pass them as argument. +This tool compiles two binaries: `pcre4msc2` and `pcre4msc3`. The binaries emulates the behaviors of regex engine (PCRE - the old version) in mod_security2 (Apache module) and the libmodsecurity3. With this programs, you can check the evaulation time and result of every regular expressions with any random (including very extreme long) input. 
Both of them (regex pattern, input subject) need to exist in two separate files, and you can pass them as arguments. The subject can also be passed through stdin if you give '-' as the subjectfile, e.g.: + +```bash +echo "arg=../../../etc/passwd&foo=var" | src/pcre4msc2 data/930110_1.txt - +data/930110_1.txt - time elapsed: 0.000012, match value: SUBJECT MATCHED 1 TIME +``` +or simply omit it: + +```bash +echo "arg=../../../etc/passwd&foo=var" | src/pcre4msc3 data/930110_1.txt +data/930110_1.txt - time elapsed: 0.000006, match value: SUBJECT MATCHED 1 TIME +``` The source tree contains some extra directories: under the `data/` you can find all of the regular expressions, what CRS uses. The files contains the id of the rule, and a suffix (there are some chained rules, where more parts uses `@rx`). @@ -53,10 +64,10 @@ Both versions uses a precompiled value, mod_security2 uses 30, libmodsecurity3 u These informations are very important to understand, why and how works the tools. -Other notes ------------ +Other notes - outdated +---------------------- -In my opinion, the libmodsecurity3 `@rx` implementation has some design errors: the `searchAll()` method (which is same here as the original code) always collects the captured substrings from subject - this could be make slower the operator. The other thing is there is no limit: if the subject contains the pattern (for example) 100 times, then it will collects all of them, so this coul be lead to high memory usage (of course it depends on pattern and subject - but the collection will stores only the first 10 matches: from TX.0 to TX.9). Another problem is that this method is used for many other places, not just the `@rx` operator, eg. variables, transformations... Just for the fun, I made an own implementation in this code, you can check it with `-f` argument. This argument works only with `pcre4msc3` tool. 
+The libmodsecurity3 `@rx` implementation had some design errors (which are now [fixed](https://github.com/SpiderLabs/ModSecurity/pull/2348)): the `searchAll()` method (which is the same here as in the original code) always collected the captured substrings from the subject - this could make the operator slower. The other issue was that there was no limit: if the subject contained the pattern (for example) 100 times, then it collected all of them, so this could lead to high memory usage (of course it depends on the pattern and subject - but the collection stores only the first 10 matches: from TX.0 to TX.9). Another problem was that this method was used in many other places, not just the `@rx` operator, e.g. variables, transformations... `pcre4msc3` contains the fixed version, but you can enable the old method with the `-f` argument. This argument works only with the `pcre4msc3` tool. In case of mod_security2, the engine doesn't collects all matches, only the first one. @@ -84,17 +95,17 @@ Let's see the logs: ``` ModSecurity: Warning. Pattern match "(?:is)" at ARGS:foo. [file "/usr/share/modsecurity-crs/rules/REQUEST-901-INITIALIZATION.conf"] [line "456"] [id "800003"] [msg "is, , , , , , , , , "] [hostname "localhost"] [uri "/"] [unique_id "XonUavaOEga8L3onA0ZYBAAAAAA"] ``` - * Nginx (libmodsecurity3): + * Nginx (libmodsecurity3 - before the [fix](https://github.com/SpiderLabs/ModSecurity/pull/2348)): ``` ModSecurity: Warning. 
Matched "Operator `Rx' with parameter `(?:is)' against variable `ARGS:foo' (Value: `this is what is this' ) [file "/usr/share/modsecurity-crs/rules/REQUEST-901-INITIALIZATION.conf"] [line "447"] [id "800003"] [rev ""] [msg "is, is, is, is, , , , , , "] [data ""] [severity "0"] [ver ""] [maturity "0"] [accuracy "0"] [hostname "0.0.0.0"] [uri "/"] [unique_id "158609110136.080640"] [ref "o2,2o5,2o13,2o18,2v10,20"], client: ::1, server: _, request: "GET /?foo=this%20is%20what%20is%20this HTTP/1.1", host: "localhost" ``` -As you can see, the `libmodsecurity3` produces the expected result, `mod_security2` doesn't. Try to increase the number if "is" patterns in your query, eg: +As you can see, `libmodsecurity3` produced the expected result, while `mod_security2` didn't. Try to increase the number of "is" patterns in your query, e.g.: ``` curl -v 'http://localhost/?foo=this%20is%20what%20is%20this%20is%20is%20is...%20is' ``` -and check the `modsec_debug.log`. As you can see the result is what I described above, `libmodsecurity3` collects all occurrence of matches: +and check the `modsec_debug.log`. 
As you can see the result is what I described above, `libmodsecurity3` old version collects all occurrence of matches: ``` Added regex subexpression TX.0: is Added regex subexpression TX.1: is diff --git a/src/msc2.c b/src/msc2.c index 1265969..d6328d2 100644 --- a/src/msc2.c +++ b/src/msc2.c @@ -18,7 +18,8 @@ #define FILESIZEMAX 131072 void showhelp(char * name) { - printf("Use: %s [OPTIONS] patternfile subjectfile\n", name); + printf("Use: %s [OPTIONS] patternfile subjectfile\n\n", name); + printf("You can pass subject through stdin, just give the '-' as subjectfile or leave it.\n\n"); printf("OPTIONS:\n"); printf("\t-h\tThis help\n"); #ifdef PCRE_CONFIG_JIT @@ -145,6 +146,7 @@ int main(int argc, char **argv) { int match_limit_recursion = 1000; float time_limit = 0.0; int debuglevel = 0; + char stdinname[] = "-"; FILE *fp; const char * patternfile = NULL, * subjectfile = NULL; @@ -153,7 +155,7 @@ int main(int argc, char **argv) { tval_result.tv_sec = 0; tval_result.tv_usec = 0; - if (argc < 3) { + if (argc < 2) { showhelp(argv[0]); return EXIT_FAILURE; } @@ -231,7 +233,11 @@ int main(int argc, char **argv) { } } - if (patternfile == NULL || subjectfile == NULL) { + if (subjectfile == NULL) { + subjectfile = stdinname; + } + + if (patternfile == NULL) { showhelp(argv[0]); return EXIT_FAILURE; } @@ -259,17 +265,28 @@ int main(int argc, char **argv) { escaped_pattern = strip_slashes(pattern, strlen(pattern)); // read subject - fp = fopen(subjectfile, "r"); - if (fp == NULL) { - fprintf(stderr, "Can't open file: %s\n", subjectfile); - return EXIT_FAILURE; + // if filename was given + if (strcmp(subjectfile, "-") != 0) { + fp = fopen(subjectfile, "r"); + if (fp == NULL) { + fprintf(stderr, "Can't open file: %s\n", subjectfile); + return EXIT_FAILURE; + } + i = 0; + while ((ci = fgetc(fp)) != EOF && i < FILESIZEMAX) { + subject[i++] = ci; + } + subject_length = (int)strlen(subject); + fclose(fp); } - i = 0; - while ((ci = fgetc(fp)) != EOF && i < FILESIZEMAX) { - 
subject[i++] = ci; + // or read from stdin + else { + i = 0; + while ((ci = getchar()) != '\n' && ci != EOF && i < FILESIZEMAX) { + subject[i++] = ci; + } + subject_length = (int)strlen(subject); } - subject_length = (int)strlen(subject); - fclose(fp); if (i == FILESIZEMAX && ci != EOF) { fprintf (stderr, "File too long: %s\n", subjectfile); return EXIT_FAILURE; diff --git a/src/msc3.cc b/src/msc3.cc index a2cad2e..38029e0 100644 --- a/src/msc3.cc +++ b/src/msc3.cc @@ -9,6 +9,9 @@ void showhelp(char * name) { std::cout << "Use: " << name << " [OPTIONS] patternfile subjectfile" << std::endl; + std::cout << std::endl; + std::cout << "You can pass subject through stdin, just give the '-' as subjectfile or leave it" << std::endl; + std::cout << std::endl; std::cout << "OPTIONS:" << std::endl; std::cout << "\t-h\tThis help" << std::endl; std::cout << "\t-n N\titerate pcre_regex as Nth times. Default value is 1." << std::endl; @@ -29,8 +32,9 @@ int main(int argc, char ** argv) { float time_limit = 0.0; double m_sub = 0.0; int debuglevel = 0; // may be later we can use different level... 
+ char stdinname[] = "-"; - if (argc < 3) { + if (argc < 2) { showhelp(argv[0]); return EXIT_FAILURE; } @@ -87,7 +91,11 @@ int main(int argc, char ** argv) { } } - if (patternfile == NULL || subjectfile == NULL) { + if (subjectfile == NULL) { + subjectfile = stdinname; + } + + if (patternfile == NULL) { showhelp(argv[0]); return EXIT_FAILURE; } @@ -104,15 +112,22 @@ int main(int argc, char ** argv) { debugvalue(debuglevel, std::string("PATTERN"), pattern); - // read subject - std::ifstream subjf(subjectfile); std::string subject; - if (subjf) { - subject.assign((std::istreambuf_iterator(subjf)), - (std::istreambuf_iterator())); + // read subject + // if filename was given + if (strcmp(subjectfile, "-") != 0) { + std::ifstream subjf(subjectfile); + if (subjf) { + subject.assign((std::istreambuf_iterator(subjf)), + (std::istreambuf_iterator())); + } + else { + std::cout << "Can't open file: " << subjectfile << std::endl; + } } + // or read from stdin else { - std::cout << "Can't open file: " << subjectfile << std::endl; + std::getline(std::cin, subject); } debugvalue(debuglevel, std::string("SUBJECT"), subject); @@ -128,15 +143,16 @@ int main(int argc, char ** argv) { clock_t m_start = clock(); if (use_old == false) { re->searchOneMatch(subject, captures); + rc = captures.size(); } else { retval = re->searchAll(subject); + rc = retval.size(); } clock_t m_end = clock(); m_sub = (m_end - m_start) / double(CLOCKS_PER_SEC); // minimal value of re->m_execrc is 0, this means no match // in this case we have to decrease the valur for the correct message - rc = re->m_execrc; if (rc == 0) { rc = -1; }