Merge pull request #931 from ronawho/localize-regex-data

Optimize how segmented array slices are interpreted as strings/bytes
Bears-R-Us · Oct 4, 2021 · 436a4fc · 436a4fc
2 parents f6213ad + 70cf722
commit 436a4fc
Show file tree

Hide file tree

Showing 3 changed files with 44 additions and 35 deletions.
diff --git a/src/CastMsg.chpl b/src/CastMsg.chpl
@@ -203,7 +203,7 @@ module CastMsg {
               } else {
                    end = oa[i+1] - 1;
               }
-              e = interpretAsString(va[start..end]) : toType;
+              e = interpretAsString(va, start..end) : toType;
           }
       } catch e: IllegalArgumentError {
           var errorMsg = "bad value in cast from string to %s".format(toType:string);

diff --git a/src/Flatten.chpl b/src/Flatten.chpl
@@ -1,6 +1,7 @@
 module Flatten {
   use ServerConfig;
 
+  use AryUtil;
   use SegmentedArray;
   use ServerErrors;
   use SymArrayDmap;
@@ -13,21 +14,23 @@ module Flatten {
   config const NULL_STRINGS_VALUE = 0:uint(8);
 
   /*
-    Convert a uint(8) array into bytes. Modeled after SegString.interpretAsString
-  */
-  inline proc interpretAsBytes(bytearray: [?D] uint(8)): bytes {
-    // Byte buffer must be local in order to make a C pointer
-    var localBytes: [{0..#D.size}] uint(8) = bytearray;
-    var cBytes = c_ptrTo(localBytes);
-    // Byte buffer is null-terminated, so length is buffer.size - 1
-    // The contents of the buffer should be copied out because cBytes will go out of scope
-    var b: bytes;
+     Interpret a region of a byte array as bytes. Modeled after interpretAsString
+   */
+  proc interpretAsBytes(bytearray: [?D] uint(8), region: range(?), borrow=false): bytes {
+    var localSlice = new lowLevelLocalizingSlice(bytearray, region);
+    // Byte buffer is null-terminated, so length is region.size - 1
     try {
-      b = createBytesWithNewBuffer(cBytes, D.size-1, D.size);
+      if localSlice.isOwned {
+        localSlice.isOwned = false;
+        return createBytesWithOwnedBuffer(localSlice.ptr, region.size-1, region.size);
+      } else if borrow {
+        return createBytesWithBorrowedBuffer(localSlice.ptr, region.size-1, region.size);
+      } else {
+        return createBytesWithNewBuffer(localSlice.ptr, region.size-1, region.size);
+      }
     } catch {
-      b = b"<error interpreting uint(8) as bytes>";
+      return b"<error interpreting uint(8) as bytes>";
     }
-    return b;
   }
 
   /*
@@ -61,23 +64,24 @@ module Flatten {
                                                                              var writeAgg = newDstAggregator(bool),
                                                                              var nbAgg = newDstAggregator(bool),
                                                                              var matchAgg = newDstAggregator(int)) {
+      var matchessize = 0 ;
       // for each string, find delim matches and set the positions of matches in writeToVal to false (non-matches will be copied to flattenedVals)
       // mark the locations of null bytes (the positions before original offsets and the last character of matches)
-      var matches = myRegex.matches(interpretAsBytes(origVals[off..#len]));
-      for m in matches {
+      for m in myRegex.matches(interpretAsBytes(origVals, off..#len, borrow=true)) {
         var match: reMatch = m[0];
         // set writeToVal to false for matches (except the last character of the match because we will write a null byte)
         for k in (off + match.offset:int)..#(match.size - 1) {
           writeAgg.copy(writeToVal[k], false);
         }
         // is writeToVal[(off + match.offset:int)..#(match.size - 1)] = false more efficient or for loop with aggregator?
         nbAgg.copy(nullByteLocations[off + match.offset:int + (match.size - 1)], true);
+        matchessize += 1;
       }
       if off != 0 {
         // the position before an offset is a null byte (except for off == 0)
         nbAgg.copy(nullByteLocations[off - 1], true);
       }
-      matchAgg.copy(numMatches[i], matches.size);
+      matchAgg.copy(numMatches[i], matchessize);
     }
 
     // writeToVal is true for positions to copy origVals (non-matches) and positions to write a null byte

diff --git a/src/SegmentedArray.chpl b/src/SegmentedArray.chpl
@@ -148,7 +148,7 @@ module SegmentedArray {
         end = offsets.a[idx+1] - 1;
       }
       // Take the slice of the bytearray and "cast" it to a chpl string
-      var s = interpretAsString(values.a[start..end]);
+      var s = interpretAsString(values.a, start..end);
       return s;
     }
 
@@ -505,7 +505,7 @@ module SegmentedArray {
                                                                                var lenAgg = newDstAggregator(int),
                                                                                var startAgg = newDstAggregator(bool),
                                                                                var matchAgg = newDstAggregator(int)) {
-        var matches = myRegex.matches(interpretAsString(origVals[off..#len]));
+        var matches = myRegex.matches(interpretAsString(origVals, off..#len, borrow=true));
         for m in matches {
           var match: reMatch = m[0];
           lenAgg.copy(sparseLens[off + match.offset:int], match.size);
@@ -616,18 +616,18 @@ module SegmentedArray {
         when SearchMode.contains {
           forall (o, l, h) in zip(oa, lengths, hits) with (var myRegex = _unsafeCompileRegex(pattern)) {
             // regexp.search searches the receiving string for matches at any offset
-            h = myRegex.search(interpretAsString(va[o..#l])).matched;
+            h = myRegex.search(interpretAsString(va, o..#l, borrow=true)).matched;
           }
         }
         when SearchMode.startsWith {
           forall (o, l, h) in zip(oa, lengths, hits) with (var myRegex = _unsafeCompileRegex(pattern)) {
             // regexp.match only returns a match if the start of the string matches the pattern
-            h = myRegex.match(interpretAsString(va[o..#l])).matched;
+            h = myRegex.match(interpretAsString(va, o..#l, borrow=true)).matched;
           }
         }
         when SearchMode.endsWith {
           forall (o, l, h) in zip(oa, lengths, hits) with (var myRegex = _unsafeCompileRegex(pattern)) {
-            var matches = myRegex.matches(interpretAsString(va[o..#l]));
+            var matches = myRegex.matches(interpretAsString(va, o..#l, borrow=true));
             var lastMatch: reMatch = matches[matches.size-1][0];
             // h = true iff start(lastMatch) + len(lastMatch) == len(string) (-1 to account for null byte)
             h = lastMatch.offset + lastMatch.size == l-1;
@@ -638,7 +638,7 @@ module SegmentedArray {
             // regexp.match only returns a match if the start of the string matches the pattern
             // h = true iff len(match) == len(string) (-1 to account for null byte)
             // if no match is found reMatch.size returns -1
-            h = myRegex.match(interpretAsString(va[o..#l])).size == l-1;
+            h = myRegex.match(interpretAsString(va, o..#l, borrow=true)).size == l-1;
           }
         }
       }
@@ -745,7 +745,7 @@ module SegmentedArray {
       var rightStart: [offsets.aD] int;
 
       forall (o, len, i) in zip(oa, lengths, offsets.aD) with (var myRegex = _unsafeCompileRegex(delimiter)) {
-        var matches = myRegex.matches(interpretAsString(va[o..#len]));
+        var matches = myRegex.matches(interpretAsString(va, o..#len, borrow=true));
         if matches.size < times {
           // not enough occurances of delim, the entire string stays together, and the param args
           // determine whether it ends up on the left or right
@@ -1375,20 +1375,25 @@ module SegmentedArray {
     }
   }
 
-  /* Convert an array of raw bytes into a Chapel string. */
-  inline proc interpretAsString(bytearray: [?D] uint(8)): string {
-    // Byte buffer must be local in order to make a C pointer
-    var localBytes: [{0..#D.size}] uint(8) = bytearray;
-    var cBytes = c_ptrTo(localBytes);
-    // Byte buffer is null-terminated, so length is buffer.size - 1
-    // The contents of the buffer should be copied out because cBytes will go out of scope
-    // var s = new string(cBytes, D.size-1, D.size, isowned=false, needToCopy=true);
-    var s: string;
+  /*
+     Interpret a region of a byte array as a Chapel string. If `borrow=false` a
+     new string is returned, otherwise the string borrows memory from the array
+     (reduces memory allocations if the string isn't needed after array)
+   */
+  proc interpretAsString(bytearray: [?D] uint(8), region: range(?), borrow=false): string {
+    var localSlice = new lowLevelLocalizingSlice(bytearray, region);
+    // Byte buffer is null-terminated, so length is region.size - 1
     try {
-      s = createStringWithNewBuffer(cBytes, D.size-1, D.size);
+      if localSlice.isOwned {
+        localSlice.isOwned = false;
+        return createStringWithOwnedBuffer(localSlice.ptr, region.size-1, region.size);
+      } else if borrow {
+        return createStringWithBorrowedBuffer(localSlice.ptr, region.size-1, region.size);
+      } else {
+        return createStringWithNewBuffer(localSlice.ptr, region.size-1, region.size);
+      }
     } catch {
-      s = "<error interpreting bytes as string>";
+      return "<error interpreting bytes as string>";
     }
-    return s;
   }
 }