Skip to content

Commit

Permalink
Merge pull request #931 from ronawho/localize-regex-data
Browse files Browse the repository at this point in the history
Optimize how segmented array slices are interpreted as strings/bytes
  • Loading branch information
reuster986 authored Oct 4, 2021
2 parents f6213ad + 70cf722 commit 436a4fc
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 35 deletions.
2 changes: 1 addition & 1 deletion src/CastMsg.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ module CastMsg {
} else {
end = oa[i+1] - 1;
}
e = interpretAsString(va[start..end]) : toType;
e = interpretAsString(va, start..end) : toType;
}
} catch e: IllegalArgumentError {
var errorMsg = "bad value in cast from string to %s".format(toType:string);
Expand Down
34 changes: 19 additions & 15 deletions src/Flatten.chpl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module Flatten {
use ServerConfig;

use AryUtil;
use SegmentedArray;
use ServerErrors;
use SymArrayDmap;
Expand All @@ -13,21 +14,23 @@ module Flatten {
config const NULL_STRINGS_VALUE = 0:uint(8);

/*
Convert a uint(8) array into bytes. Modeled after SegString.interpretAsString
*/
inline proc interpretAsBytes(bytearray: [?D] uint(8)): bytes {
// Byte buffer must be local in order to make a C pointer
var localBytes: [{0..#D.size}] uint(8) = bytearray;
var cBytes = c_ptrTo(localBytes);
// Byte buffer is null-terminated, so length is buffer.size - 1
// The contents of the buffer should be copied out because cBytes will go out of scope
var b: bytes;
Interpret a region of a byte array as bytes. Modeled after interpretAsString
*/
proc interpretAsBytes(bytearray: [?D] uint(8), region: range(?), borrow=false): bytes {
var localSlice = new lowLevelLocalizingSlice(bytearray, region);
// Byte buffer is null-terminated, so length is region.size - 1
try {
b = createBytesWithNewBuffer(cBytes, D.size-1, D.size);
if localSlice.isOwned {
localSlice.isOwned = false;
return createBytesWithOwnedBuffer(localSlice.ptr, region.size-1, region.size);
} else if borrow {
return createBytesWithBorrowedBuffer(localSlice.ptr, region.size-1, region.size);
} else {
return createBytesWithNewBuffer(localSlice.ptr, region.size-1, region.size);
}
} catch {
b = b"<error interpreting uint(8) as bytes>";
return b"<error interpreting uint(8) as bytes>";
}
return b;
}

/*
Expand Down Expand Up @@ -61,23 +64,24 @@ module Flatten {
var writeAgg = newDstAggregator(bool),
var nbAgg = newDstAggregator(bool),
var matchAgg = newDstAggregator(int)) {
var matchessize = 0 ;
// for each string, find delim matches and set the positions of matches in writeToVal to false (non-matches will be copied to flattenedVals)
// mark the locations of null bytes (the positions before original offsets and the last character of matches)
var matches = myRegex.matches(interpretAsBytes(origVals[off..#len]));
for m in matches {
for m in myRegex.matches(interpretAsBytes(origVals, off..#len, borrow=true)) {
var match: reMatch = m[0];
// set writeToVal to false for matches (except the last character of the match because we will write a null byte)
for k in (off + match.offset:int)..#(match.size - 1) {
writeAgg.copy(writeToVal[k], false);
}
// TODO: would writeToVal[(off + match.offset:int)..#(match.size - 1)] = false be more efficient than this for loop with an aggregator?
nbAgg.copy(nullByteLocations[off + match.offset:int + (match.size - 1)], true);
matchessize += 1;
}
if off != 0 {
// the position before an offset is a null byte (except for off == 0)
nbAgg.copy(nullByteLocations[off - 1], true);
}
matchAgg.copy(numMatches[i], matches.size);
matchAgg.copy(numMatches[i], matchessize);
}

// writeToVal is true for positions to copy origVals (non-matches) and positions to write a null byte
Expand Down
43 changes: 24 additions & 19 deletions src/SegmentedArray.chpl
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ module SegmentedArray {
end = offsets.a[idx+1] - 1;
}
// Take the slice of the bytearray and "cast" it to a chpl string
var s = interpretAsString(values.a[start..end]);
var s = interpretAsString(values.a, start..end);
return s;
}

Expand Down Expand Up @@ -505,7 +505,7 @@ module SegmentedArray {
var lenAgg = newDstAggregator(int),
var startAgg = newDstAggregator(bool),
var matchAgg = newDstAggregator(int)) {
var matches = myRegex.matches(interpretAsString(origVals[off..#len]));
var matches = myRegex.matches(interpretAsString(origVals, off..#len, borrow=true));
for m in matches {
var match: reMatch = m[0];
lenAgg.copy(sparseLens[off + match.offset:int], match.size);
Expand Down Expand Up @@ -616,18 +616,18 @@ module SegmentedArray {
when SearchMode.contains {
forall (o, l, h) in zip(oa, lengths, hits) with (var myRegex = _unsafeCompileRegex(pattern)) {
// regexp.search searches the receiving string for matches at any offset
h = myRegex.search(interpretAsString(va[o..#l])).matched;
h = myRegex.search(interpretAsString(va, o..#l, borrow=true)).matched;
}
}
when SearchMode.startsWith {
forall (o, l, h) in zip(oa, lengths, hits) with (var myRegex = _unsafeCompileRegex(pattern)) {
// regexp.match only returns a match if the start of the string matches the pattern
h = myRegex.match(interpretAsString(va[o..#l])).matched;
h = myRegex.match(interpretAsString(va, o..#l, borrow=true)).matched;
}
}
when SearchMode.endsWith {
forall (o, l, h) in zip(oa, lengths, hits) with (var myRegex = _unsafeCompileRegex(pattern)) {
var matches = myRegex.matches(interpretAsString(va[o..#l]));
var matches = myRegex.matches(interpretAsString(va, o..#l, borrow=true));
var lastMatch: reMatch = matches[matches.size-1][0];
// h = true iff start(lastMatch) + len(lastMatch) == len(string) (-1 to account for null byte)
h = lastMatch.offset + lastMatch.size == l-1;
Expand All @@ -638,7 +638,7 @@ module SegmentedArray {
// regexp.match only returns a match if the start of the string matches the pattern
// h = true iff len(match) == len(string) (-1 to account for null byte)
// if no match is found reMatch.size returns -1
h = myRegex.match(interpretAsString(va[o..#l])).size == l-1;
h = myRegex.match(interpretAsString(va, o..#l, borrow=true)).size == l-1;
}
}
}
Expand Down Expand Up @@ -745,7 +745,7 @@ module SegmentedArray {
var rightStart: [offsets.aD] int;

forall (o, len, i) in zip(oa, lengths, offsets.aD) with (var myRegex = _unsafeCompileRegex(delimiter)) {
var matches = myRegex.matches(interpretAsString(va[o..#len]));
var matches = myRegex.matches(interpretAsString(va, o..#len, borrow=true));
if matches.size < times {
// not enough occurrences of delim, the entire string stays together, and the param args
// determine whether it ends up on the left or right
Expand Down Expand Up @@ -1375,20 +1375,25 @@ module SegmentedArray {
}
}

/* Convert an array of raw bytes into a Chapel string. */
inline proc interpretAsString(bytearray: [?D] uint(8)): string {
// Byte buffer must be local in order to make a C pointer
var localBytes: [{0..#D.size}] uint(8) = bytearray;
var cBytes = c_ptrTo(localBytes);
// Byte buffer is null-terminated, so length is buffer.size - 1
// The contents of the buffer should be copied out because cBytes will go out of scope
// var s = new string(cBytes, D.size-1, D.size, isowned=false, needToCopy=true);
var s: string;
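/*
Interpret a region of a byte array as a Chapel string. If `borrow=false` a
new string is returned; otherwise the string borrows memory from the array,
which reduces memory allocations when the string is not needed beyond the
lifetime of the array. If the localized slice owns its buffer, that buffer
is handed over to the returned string regardless of `borrow`.
*/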
proc interpretAsString(bytearray: [?D] uint(8), region: range(?), borrow=false): string {
var localSlice = new lowLevelLocalizingSlice(bytearray, region);
// Byte buffer is null-terminated, so length is region.size - 1
try {
s = createStringWithNewBuffer(cBytes, D.size-1, D.size);
if localSlice.isOwned {
localSlice.isOwned = false;
return createStringWithOwnedBuffer(localSlice.ptr, region.size-1, region.size);
} else if borrow {
return createStringWithBorrowedBuffer(localSlice.ptr, region.size-1, region.size);
} else {
return createStringWithNewBuffer(localSlice.ptr, region.size-1, region.size);
}
} catch {
s = "<error interpreting bytes as string>";
return "<error interpreting bytes as string>";
}
return s;
}
}

0 comments on commit 436a4fc

Please sign in to comment.