Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update lightgrep scanner for bulk_extractor 2.0 #421

Draft
wants to merge 31 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
43f5303
R - delete lightgrep scanners not ported, edit Makefile.am accordingly
juliapal Apr 11, 2023
287b882
F - skeleton of new lightgrep scanner for BE2.0
juliapal Apr 11, 2023
dc034c8
R - comment out unused phases
juliapal Apr 19, 2023
d638a2a
F!! - initialization of Lightgrep Controller
juliapal Apr 26, 2023
8ecacd1
R - If PHASE_INIT2 happens >1, Get() causes Fsm to be empty on subseq…
juliapal May 2, 2023
ee7b242
F - simplify addUserPatterns for now
juliapal May 2, 2023
7e1eefa
F - make regcomp, gotHit, HitData, and scan work with new skeleton
juliapal May 2, 2023
c629d3a
R - Prog may not be initialized before numPatterns is called
juliapal May 2, 2023
72c69a8
R - fix declarations
juliapal May 2, 2023
99bc030
F - Append lightgrep feature recorder to feature defs
juliapal May 2, 2023
1c1c4d0
Delete unused scanner
juliapal May 4, 2023
4579d9c
give write_buf good args
juliapal May 4, 2023
7553305
Result is positive upon success
juliapal May 4, 2023
a3616e0
F!! enable multiple patterns passed through CLI
juliapal May 9, 2023
5097c1c
F - add user_files param to addUserPatterns to avoid scanner config a…
juliapal May 9, 2023
92b7f75
F - Handle user files with parsed patterns in lightgrep scanner
juliapal May 9, 2023
c553d2b
Remove unused or commented code
juliapal May 9, 2023
707466b
b - lightgrep should run even if find is disabled
juliapal May 11, 2023
48b9ca3
F - enable histogram for lightgrep
juliapaluch May 11, 2023
e7ac804
get rid of comments & unused code
juliapaluch May 11, 2023
8cdc2f1
F - delete Fsm if not deleted by regcomp
juliapaluch May 11, 2023
bb88ad8
b - don't declare initScan
juliapaluch May 11, 2023
90bb108
F - remove Scanner "global" variable
juliapaluch May 11, 2023
8c143b3
remove unused includes
juliapaluch May 11, 2023
f553e62
F - throw exception when lightgrep fails to parse pattern
juliapaluch May 11, 2023
799221b
better error messages
juliapaluch May 11, 2023
0ca43ec
F - avoid unnecessary repeated heap alloc/dealloc
juliapaluch May 12, 2023
007b9c9
F!! - Reverting use of thread local because it seems to cause a perfo…
juliapaluch May 30, 2023
fb0e42e
F - update scanner name and version
juliapaluch May 30, 2023
5e55041
a - Delete superfluous whitespace
juliapaluch May 30, 2023
16e8eeb
a - formatting, fix inaccurate comments
juliapaluch May 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions src/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,11 @@ scan_accts.o: sbuf_flex_scanner.h
scan_email.o: sbuf_flex_scanner.h
scan_gps.o: sbuf_flex_scanner.h

# These scanners are based on Lightbox Technology's lightgrep
# These scanners are based on Stroz Friedberg's lightgrep
lightgrep_scanners = \
pattern_scanner.cpp pattern_scanner.h \
pattern_scanner_utils.cpp pattern_scanner_utils.h \
scan_lightgrep.cpp \
scan_accts_lg.cpp \
scan_base16_lg.cpp \
scan_email_lg.cpp \
scan_gps_lg.cpp
scan_lightgrep.cpp

# scanners_builtin are the scanners that are compiled into the binary

Expand Down
6 changes: 1 addition & 5 deletions src/bulk_extractor_scanners.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,5 @@ SCANNER(zip)


#ifdef HAVE_LIBLIGHTGREP
//SCANNER(accts_lg)
//SCANNER(base16_lg)
//SCANNER(email_lg)
//SCANNER(gps_lg)
//SCANNER(lightgrep)
SCANNER(lightgrep)
#endif
217 changes: 47 additions & 170 deletions src/pattern_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,11 @@
// if liblightgrep isn't present, compiles to nothing
#ifdef HAVE_LIBLIGHTGREP

#include "beregex.h"
#include "histogram.h"
#include "pattern_scanner.h"
#include "scanner_set.h"

#include <lightgrep/api.h>

#include <iostream>
#include <algorithm>
#include <limits>
#include <fstream>

#include <iostream>

#ifdef LGBENCHMARK
#include <chrono>
#endif
Expand All @@ -25,107 +17,69 @@ namespace {
const unsigned int NumDefaultEncodings = 2;
}

bool PatternScanner::handleParseError(const Handler& h, LG_Error* err) const {
cerr << "Parse error on '" << h.RE << "' in " << Name
<< ": " << err->Message << endl;
return false;
}

void PatternScanner::shutdown(const scanner_params&) {
for (vector<const Handler*>::iterator itr(Handlers.begin()); itr != Handlers.end(); ++itr) {
delete *itr;
}
}
/*********************************************************/

LightgrepController::LightgrepController()
: ParsedPattern(lg_create_pattern()), // Reuse the parsed pattern data structure for efficiency
Fsm(lg_create_fsm(1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed
PatternInfo(lg_create_pattern_map(1000)), // Reserve space for 1000 patterns in the pattern map
Fsm(lg_create_fsm(1000, 1 << 20)), // Reserve space for 1M states in the automaton--will grow if needed
Prog(0),
Scanners()
{
}

LightgrepController::~LightgrepController() {
lg_destroy_pattern(ParsedPattern);
lg_destroy_pattern_map(PatternInfo);
lg_destroy_program(Prog);
if (Fsm) {
lg_destroy_fsm(Fsm);
Fsm = 0;
}
}

LightgrepController& LightgrepController::Get() {
// Single shared controller, held as a function-local static (the "Meyers
// Singleton" from Scott Meyers' Effective C++): it is constructed the first
// time Get() runs, and every call returns a reference to that same object.
static LightgrepController instance;
return instance;
}

bool LightgrepController::addScanner(PatternScanner& scanner) {
// Add patterns and handlers from a Scanner to the centralized automaton
LG_Error* lgErr = 0;
bool LightgrepController::addUserPatterns(
PatternScanner& scanner,
const vector<string>& cli_patterns,
const vector<filesystem::path>& user_files) {

unsigned int patBegin = numeric_limits<unsigned int>::max(),
patEnd = 0;
LG_Error *err = 0;
LG_KeyOptions opts;
opts.FixedString = 0;
opts.CaseInsensitive = 0;

int idx = -1;
bool good = true;

// iterate all the scanner's handlers
for (vector<const Handler*>::const_iterator h(scanner.handlers().begin()); h != scanner.handlers().end(); ++h) {
bool good = false;
if (lg_parse_pattern(ParsedPattern, (*h)->RE.c_str(), &(*h)->Options, &lgErr)) { // parse the pattern
for (vector<string>::const_iterator enc((*h)->Encodings.begin()); enc != (*h)->Encodings.end(); ++enc) {
idx = lg_add_pattern(Fsm, PatternInfo, ParsedPattern, enc->c_str(), &lgErr); // add the pattern for each given encoding
if (idx >= 0) {
// add the handler callback to the pattern map, associated with the pattern index
lg_pattern_info(PatternInfo, idx)->UserData = const_cast<void*>(static_cast<const void*>(&((*h)->Callback)));
patBegin = std::min(patBegin, static_cast<unsigned int>(idx));
good = true;
// add patterns from single command-line arguments
for (const auto& itr : cli_patterns) {
if (lg_parse_pattern(ParsedPattern, itr.c_str(), &opts, &err)) {
for (unsigned int i = 0; i < NumDefaultEncodings; ++i) {
if (lg_add_pattern(Fsm, ParsedPattern, DefaultEncodingsCStrings[i], 0, &err) < 0) {
good = false;
break;
}
}

// std::cerr << '\t' << (int)((*h)->Options.FixedString) << '\t' << (int)((*h)->Options.CaseInsensitive) << std::endl;
} else {
good = false;
}
if (!good) {
if (scanner.handleParseError(**h, lgErr)) {
lg_free_error(lgErr);
lgErr = 0;
}
else {
return false;
}
cerr << "Lightgrep error parsing '" << itr.c_str() << "': " << err->Message << endl;
lg_free_error(err);
return false;
}
}
patEnd = lg_pattern_map_size(PatternInfo);
// record the range of this scanner's patterns in the central pattern map
scanner.patternRange() = make_pair(patBegin, patEnd);
Scanners.push_back(&scanner);
return true;
}

/* note: findopts is now part of scanner_set.scanner_config, you need to pass that in here. */
bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnType* callbackPtr, const FindOpts& user) {
// Add patterns specified as keywords by the user
// Similar to above, but does not have a handler per pattern
unsigned int patBegin = lg_pattern_map_size(PatternInfo),
patEnd = 0;

LG_KeyOptions opts;
opts.FixedString = 0;
opts.CaseInsensitive = 0;

LG_Error *err = 0;

// Add patterns from files
for (vector<string>::const_iterator itr(user.Files.begin()); itr != user.Files.end(); ++itr) {
ifstream file(itr->c_str(), ios::in);
for (const auto& itr : user_files) {
ifstream file(itr.c_str(), ios::in);
if (!file.is_open()) {
cerr << "Could not open pattern file '" << *itr << "'." << endl;
cerr << "Lightgrep scanner could not open pattern file '" << itr.c_str() << "'." << endl;
return false;
}
string contents = string(istreambuf_iterator<char>(file), istreambuf_iterator<char>());

const char* contentsCStr = contents.c_str();
// Add all the patterns from the files in one fell swoop
if (lg_add_pattern_list(Fsm, PatternInfo, contentsCStr, itr->c_str(), DefaultEncodingsCStrings, 2, &opts, &err) < 0) {
if (lg_add_pattern_list(Fsm, contentsCStr, itr.c_str(), DefaultEncodingsCStrings, NumDefaultEncodings, &opts, &err) < 0) {
vector<string> lines;
istringstream input(contents);
string line;
Expand All @@ -135,98 +89,62 @@ bool LightgrepController::addUserPatterns(PatternScanner& scanner, CallbackFnTyp
}
LG_Error* cur(err);
while (cur) {
cerr << "Error in " << *itr << ", line " << cur->Index+1 << ", pattern '" << lines[cur->Index]
cerr << "Lightgrep parsing error in " << itr.c_str() << ", on line " << cur->Index+1 << ", on pattern '" << lines[cur->Index]
<< "': " << cur->Message << endl;
cur = cur->Next;
}
lg_free_error(err);
return false;
}
}
// add patterns from single command-line arguments
for (vector<string>::const_iterator itr(user.Patterns.begin()); itr != user.Patterns.end(); ++itr) {
bool good = false;
if (lg_parse_pattern(ParsedPattern, itr->c_str(), &opts, &err)) {
for (unsigned int i = 0; i < NumDefaultEncodings; ++i) {
if (lg_add_pattern(Fsm, PatternInfo, ParsedPattern, DefaultEncodingsCStrings[i], &err) >= 0) {
good = true;
}
}
}
if (!good) {
cerr << "Error on '" << *itr << "': " << err->Message << endl;
lg_free_error(err);
return false;
}
}
patEnd = lg_pattern_map_size(PatternInfo);
for (unsigned int i = patBegin; i < patEnd; ++i) {
lg_pattern_info(PatternInfo, i)->UserData = const_cast<void*>(static_cast<const void*>(callbackPtr));
}
scanner.patternRange() = make_pair(patBegin, patEnd);
Scanners.push_back(&scanner);

return true;
}

void LightgrepController::regcomp() {
LG_ProgramOptions progOpts;
progOpts.Determinize = 1;
progOpts.DeterminizeDepth = 10;
// Create an optimized, immutable form of the accumulated automaton
Prog = lg_create_program(Fsm, &progOpts);
lg_destroy_fsm(Fsm);
Fsm = 0;

cerr << lg_pattern_map_size(PatternInfo) << " lightgrep patterns, logic size is " << lg_program_size(Prog) << " bytes, " << Scanners.size() << " active scanners" << std::endl;
#ifdef LGBENCHMARK
cerr << "timer second ratio " << chrono::high_resolution_clock::period::num << "/" <<
chrono::high_resolution_clock::period::den << endl;
#endif
}

struct HitData {
// Everything we need for processing a hit
LightgrepController* lgc;
const vector<PatternScanner*>* scannerTable;
const scanner_params* sp;
//const recursion_control_block* rcb;
feature_recorder &recorder;
const sbuf_t &sbuf;
};

void gotHit(void* userData, const LG_SearchHit* hit) {
#ifdef LGBENCHMARK
// no callback, just increment hit counter
++(*static_cast<uint64_t*>(userData));
#else
// trampoline back into LightgrepController::processHit() from the void* userData
HitData* hd(static_cast<HitData*>(userData));
hd->lgc->processHit(*hd->scannerTable, *hit, *hd->sp, *hd->rcb);
HitData* data(reinterpret_cast<HitData*>(userData));
data->recorder.write_buf(data->sbuf, hit->Start, hit->End - hit->Start);
#endif
}

void LightgrepController::scan(const scanner_params& sp, const recursion_control_block &rcb) {
// Scan the sbuf for pattern hits, invoking various scanners' handlers as hits are encountered
void LightgrepController::scan(const scanner_params& sp) {
// Scan the sbuf for pattern hits
if (!Prog) {
// we had no valid patterns, do nothing
return;
}
// First, clone all the scanners so that there's no shared data between threads
vector<PatternScanner*> scannerTable(lg_pattern_map_size(PatternInfo)); // [Keyword Index -> scanner], no ownership
vector<PatternScanner*> scannerList; // ownership list
for (vector<PatternScanner*>::const_iterator itr(Scanners.begin()); itr != Scanners.end(); ++itr) {
PatternScanner *s = (*itr)->clone();
scannerList.push_back(s);
for (unsigned int i = s->patternRange().first; i < s->patternRange().second; ++i) {
scannerTable[i] = s;
}
s->initScan(sp); // let the scanner know we're about to scan an sbuf
}

LG_ContextOptions ctxOpts;
ctxOpts.TraceBegin = 0xffffffffffffffff;
ctxOpts.TraceEnd = 0;

LG_HCONTEXT ctx = lg_create_context(Prog, &ctxOpts); // create a search context; cannot be shared, so local to scan

const sbuf_t &sbuf = sp.sbuf;

HitData callbackInfo = { this, &scannerTable, &sp, &rcb };
const sbuf_t &sbuf = *sp.sbuf;
HitData callbackInfo = { sp.named_feature_recorder("lightgrep"), *sp.sbuf };
void* userData = &callbackInfo;

#ifdef LGBENCHMARK // perform timings of lightgrep search functions only -- no callbacks
Expand All @@ -239,9 +157,9 @@ void LightgrepController::scan(const scanner_params& sp, const recursion_control

// search the sbuf in one go
// the gotHit() function will be invoked for each pattern hit
if (lg_search(ctx, (const char*)sbuf.buf, (const char*)sbuf.buf + sbuf.pagesize, 0, userData, gotHit) < numeric_limits<uint64_t>::max()) {
if (lg_search(ctx, (const char*)sbuf.get_buf(), (const char*)sbuf.get_buf() + sbuf.pagesize, 0, userData, gotHit) < numeric_limits<uint64_t>::max()) {
// resolve potential hits that want data into the sbuf margin, without beginning any new hits
lg_search_resolve(ctx, (const char*)sbuf.buf + sbuf.pagesize, (const char*)sbuf.buf + sbuf.bufsize, sbuf.pagesize, userData, gotHit);
lg_search_resolve(ctx, (const char*)sbuf.get_buf() + sbuf.pagesize, (const char*)sbuf.get_buf() + sbuf.bufsize, sbuf.pagesize, userData, gotHit);
}
// flush any remaining hits; there's no more data
lg_closeout_search(ctx, userData, gotHit);
Expand All @@ -254,54 +172,13 @@ void LightgrepController::scan(const scanner_params& sp, const recursion_control
std::stringstream buf;
buf << " ** Time: " << sbuf.pos0.str() << '\t' << sbuf.pagesize << '\t' << t.count() << '\t' << seconds<< '\t' << hitCount << '\t' << bw << std::endl;
std::cout << buf.str();
// std::cout.flush();
#endif

lg_destroy_context(ctx);

// don't call PatternScanner::shutdown() on these! that only happens on prototypes
for (vector<PatternScanner*>::const_iterator itr(scannerList.begin()); itr != scannerList.end(); ++itr) {
(*itr)->finishScan(sp); // let the scanner know we're done with the sbuf
delete *itr;
}
}

void LightgrepController::processHit(const vector<PatternScanner*>& sTbl, const LG_SearchHit& hit, const scanner_params& sp, const recursion_control_block& rcb) {
// lookup the handler's callback functor in the pattern map, then invoke it
CallbackFnType* cbPtr(static_cast<CallbackFnType*>(lg_pattern_info(PatternInfo, hit.KeywordIndex)->UserData));
((*sTbl[hit.KeywordIndex]).*(*cbPtr))(hit, sp, rcb); // ...yep...
}

unsigned int LightgrepController::numPatterns() const {
return lg_pattern_map_size(PatternInfo);
return Prog ? lg_prog_pattern_count(Prog) : 0;
}

/*********************************************************/

void scan_lg(PatternScanner& scanner, class scanner_params &sp) {
// utility implementation of the normal scan function for a PatternScanner instance
switch (sp.phase) {
case scanner_params::PHASE_STARTUP:
scanner.startup(sp);
break;
case scanner_params::PHASE_INIT:
scanner.init(sp);
if (!LightgrepController::Get().addScanner(scanner)) {
// It's fine for user patterns not to parse, but there's no excuse for a scanner so exit.
cerr << "Aborting. Fix pattern or disable scanner to continue." << endl;
exit(EXIT_FAILURE);
}
break;
case scanner_params::PHASE_SHUTDOWN:
scanner.shutdown(sp);
break;
case scanner_params::PHASE_CLEANUP:
TODO - to something here.
default:
break;
}
}

/*********************************************************/

#endif // HAVE_LIBLIGHTGREP
Loading