Commit

Refactored file reading to re-use io.shiftleft.x2cpg.IOUtils (#54)
Also:
 * updated log4j-slf4j-impl version to 2.17.0
 * updated cpg and joern versions
 * updated to SBT 1.6.0
max-leuthaeuser authored Dec 27, 2021
1 parent 61e3b63 commit d57fdc9
Showing 12 changed files with 114 additions and 194 deletions.
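In short, call sites stop opening and decoding a scala.io BufferedSource themselves and instead go through a shared line-based reader. A minimal before/after sketch of that pattern (the input path is hypothetical, and the old helper is approximated with scala.io.Source; the new call matches FileUtils.readLinesInFile from the diffs below, which delegates to io.shiftleft.x2cpg.IOUtils):

```scala
import java.nio.file.{Path, Paths}
import scala.util.{Failure, Success, Try, Using}

import io.shiftleft.js2cpg.io.FileUtils

object ReadPatternSketch {

  // Before: open a BufferedSource, decode it manually, close it via Using
  // (rough shape of the removed bufferedSourceFromFile/contentFromBufferedSource pair).
  def readOld(path: Path): Try[String] =
    Using(scala.io.Source.fromFile(path.toFile)(scala.io.Codec.UTF8))(_.mkString)

  // After: one call yields the file as lines; character decoding and the
  // emscripten cleanup happen behind FileUtils.readLinesInFile.
  def readNew(path: Path): Try[Seq[String]] =
    Try(FileUtils.readLinesInFile(path))

  def main(args: Array[String]): Unit = {
    val file = Paths.get("src", "index.js") // hypothetical input file
    readNew(file) match {
      case Success(lines) => println(s"read ${lines.size} lines")
      case Failure(e)     => println(s"could not read $file: ${e.getMessage}")
    }
  }
}
```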
8 changes: 4 additions & 4 deletions build.sbt
@@ -1,5 +1,5 @@
-val cpgVersion = "1.3.461"
-val joernVersion = "1.1.387"
+val cpgVersion = "1.3.474"
+val joernVersion = "1.1.407"
 
 val gitCommitString = SettingKey[String]("gitSha")
 
@@ -87,9 +87,9 @@ lazy val commonSettings = Seq(
     "org.graalvm.js" % "js" % "21.3.0",
     "com.github.pathikrit" %% "better-files" % "3.9.1",
     "org.slf4j" % "slf4j-api" % "1.7.32",
-    "org.apache.logging.log4j" % "log4j-slf4j-impl" % "2.16.0" % Runtime,
+    "org.apache.logging.log4j" % "log4j-slf4j-impl" % "2.17.0" % Runtime,
     "com.typesafe.play" %% "play-json" % "2.9.2",
-    "com.fasterxml.jackson" % "jackson-base" % "2.13.0",
+    "com.fasterxml.jackson" % "jackson-base" % "2.13.1",
     "com.atlassian.sourcemap" % "sourcemap" % "2.0.0",
     "commons-io" % "commons-io" % "2.11.0",
     "io.shiftleft" %% "semanticcpg" % cpgVersion % Test classifier "tests",
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
-sbt.version=1.5.7
+sbt.version=1.6.0
16 changes: 5 additions & 11 deletions src/main/scala/io/shiftleft/js2cpg/core/Config.scala
@@ -8,7 +8,7 @@ import io.shiftleft.js2cpg.io.FileUtils
 import io.shiftleft.js2cpg.parser.PackageJsonParser
 import io.shiftleft.js2cpg.preprocessing.TypescriptTranspiler
 
-import scala.util.{Failure, Success, Using}
+import scala.util.{Try, Failure, Success}
 import scala.util.matching.Regex
 
 object Config {
@@ -72,16 +72,10 @@ case class Config(srcDir: String = "",
 
   def withLoadedIgnores(): Config = {
     val slIngoreFilePath = Paths.get(srcDir, Config.SL_IGNORE_FILE)
-    val result = Using(FileUtils.bufferedSourceFromFile(slIngoreFilePath)) { bufferedSource =>
-      val content = FileUtils.contentFromBufferedSource(bufferedSource)
-      content.split(System.lineSeparator()).toSeq.map(createPathForIgnore)
-    }
-
-    result match {
-      case Failure(_) =>
-        this
-      case Success(loadedIgnoredFiles) =>
-        this.copy(ignoredFiles = ignoredFiles ++ loadedIgnoredFiles)
+    Try(FileUtils.readLinesInFile(slIngoreFilePath)) match {
+      case Failure(_) => this
+      case Success(lines) =>
+        this.copy(ignoredFiles = ignoredFiles ++ lines.map(createPathForIgnore))
     }
   }
 
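One behavioural detail in the hunk above: the old code read the whole ignore file and split it on System.lineSeparator(), while readLinesInFile already yields one entry per line. A small hypothetical sketch of the resulting loading flow (the ignore-file name and the stand-in reader are assumptions, not the real Config code):

```scala
import java.nio.file.{Files, Path, Paths}
import scala.jdk.CollectionConverters._
import scala.util.{Failure, Success, Try}

object SlIgnoreSketch {
  // Stand-in for FileUtils.readLinesInFile: one entry per line of the file.
  private def readLines(path: Path): Seq[String] =
    Files.readAllLines(path).asScala.toSeq

  // Mirrors the new withLoadedIgnores flow: a missing or unreadable ignore file
  // simply leaves the ignore list unchanged.
  def loadIgnores(srcDir: String): Seq[String] =
    Try(readLines(Paths.get(srcDir, ".slignore"))) match { // ".slignore" assumed for Config.SL_IGNORE_FILE
      case Failure(_)     => Seq.empty
      case Success(lines) => lines.filter(_.nonEmpty)
    }

  def main(args: Array[String]): Unit =
    println(loadIgnores("/tmp/project")) // hypothetical project directory
}
```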
36 changes: 17 additions & 19 deletions src/main/scala/io/shiftleft/js2cpg/cpg/passes/AstCreationPass.scala
@@ -14,7 +14,7 @@ import io.shiftleft.passes.{DiffGraph, IntervalKeyPool, ParallelCpgPass}
 import org.slf4j.LoggerFactory
 import io.shiftleft.js2cpg.util.SourceWrapper._
 
-import scala.util.{Failure, Success, Try, Using}
+import scala.util.{Failure, Success, Try}
 
 /**
  * Given a list of filenames, this pass creates the abstract syntax tree and CPG AST for each file.
@@ -88,24 +88,22 @@ class AstCreationPass(srcDir: File,
   }
 
   private def parse(path: Path, rootDir: Path): Try[ParseResult] = {
-    Using(FileUtils.bufferedSourceFromFile(path)) { bufferedSource =>
-      val relPath = rootDir.relativize(path).toString
-
-      val fileStatistics = JsFileChecks.check(relPath, bufferedSource.reset())
-
-      val source = Source
-        .sourceFor(relPath, FileUtils.contentFromBufferedSource(bufferedSource))
-      val jsSource = source.toJsSource(srcDir, rootDir)
-
-      logger.debug(s"Parsing file '$relPath'.")
-      Try(JavaScriptParser.parseFromSource(jsSource)) match {
-        case Failure(exception) =>
-          report.addReportInfo(jsSource.originalFilePath, fileStatistics.linesOfCode)
-          throw exception
-        case Success((ast, jsSource)) =>
-          report.addReportInfo(jsSource.originalFilePath, fileStatistics.linesOfCode, parsed = true)
-          ParseResult(File(path), jsSource, ast)
-      }
+    val lines = FileUtils.readLinesInFile(path)
+    val relPath = rootDir.relativize(path).toString
+
+    val fileStatistics = JsFileChecks.check(relPath, lines)
+
+    val source = Source.sourceFor(relPath, lines.mkString("\n"))
+    val jsSource = source.toJsSource(srcDir, rootDir)
+
+    logger.debug(s"Parsing file '$relPath'.")
+    Try(JavaScriptParser.parseFromSource(jsSource)) match {
+      case Failure(exception) =>
+        report.addReportInfo(jsSource.originalFilePath, fileStatistics.linesOfCode)
+        throw exception
+      case Success((ast, jsSource)) =>
+        report.addReportInfo(jsSource.originalFilePath, fileStatistics.linesOfCode, parsed = true)
+        Success(ParseResult(File(path), jsSource, ast))
     }
   }
 
13 changes: 6 additions & 7 deletions src/main/scala/io/shiftleft/js2cpg/io/EmScriptenCleaner.scala
@@ -8,16 +8,15 @@ object EmScriptenCleaner {
    * If code contains emscripten code (marked with start funcs and end funcs comments)
    * we simply replace it with empty lines.
    */
-  def clean(code: Iterator[String]): Iterator[String] = {
-    val lines = code.toSeq
-    val startIndex = lines.indexWhere(EMSCRIPTEN_START_FUNCS.matches)
-    val endIndex = lines.indexWhere(EMSCRIPTEN_END_FUNCS.matches)
+  def clean(code: Seq[String]): Iterator[String] = {
+    val startIndex = code.indexWhere(EMSCRIPTEN_START_FUNCS.matches)
+    val endIndex = code.indexWhere(EMSCRIPTEN_END_FUNCS.matches)
     if (startIndex != -1 && endIndex != -1 && endIndex > startIndex) {
-      (lines.slice(0, startIndex) ++
+      (code.slice(0, startIndex) ++
         Seq.fill(endIndex - startIndex - 1)(System.lineSeparator()) ++
-        lines.slice(endIndex + 1, lines.length)).iterator
+        code.slice(endIndex + 1, code.length)).iterator
     } else {
-      lines.iterator
+      code.iterator
     }
   }
 
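A brief, hypothetical usage sketch of the new Seq[String]-based signature; the sample lines and the marker comments are made up and merely stand in for whatever EMSCRIPTEN_START_FUNCS and EMSCRIPTEN_END_FUNCS actually match:

```scala
import io.shiftleft.js2cpg.io.EmScriptenCleaner

object EmScriptenCleanerSketch {
  def main(args: Array[String]): Unit = {
    // Hypothetical transpiled file: the block between the two markers is replaced
    // with blank lines so line numbers after the block stay roughly stable.
    val lines = Seq(
      "var ready = init();",
      "// EMSCRIPTEN_START_FUNCS", // assumed marker format
      "function generated() {}",
      "// EMSCRIPTEN_END_FUNCS", // assumed marker format
      "ready.then(run);"
    )
    EmScriptenCleaner.clean(lines).foreach(println)
  }
}
```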
89 changes: 11 additions & 78 deletions src/main/scala/io/shiftleft/js2cpg/io/FileUtils.scala
@@ -2,20 +2,16 @@ package io.shiftleft.js2cpg.io
 
 import better.files.File
 
-import java.io.Reader
-import java.math.BigInteger
-import java.nio.charset.{CharsetDecoder, CodingErrorAction}
-import java.nio.file.{FileVisitResult, Files, Path, SimpleFileVisitor}
+import java.nio.file.{Files, FileVisitResult, Path, SimpleFileVisitor}
 import io.shiftleft.js2cpg.core.Config
 import io.shiftleft.js2cpg.io.FileDefaults._
+import io.shiftleft.x2cpg.IOUtils
 import org.slf4j.LoggerFactory
 
 import java.nio.file.attribute.BasicFileAttributes
 import java.security.{DigestInputStream, MessageDigest}
 import scala.collection.concurrent.TrieMap
-import scala.collection.{SortedMap, mutable}
-import scala.io.{BufferedSource, Codec, Source}
-import scala.jdk.CollectionConverters._
+import scala.collection.{mutable, SortedMap}
 
 object FileUtils {
 
@@ -69,28 +65,6 @@ object FileUtils {
     }
   }
 
-  /**
-    * Creates a new UTF-8 decoder.
-    * Sadly, instances of CharsetDecoder are not thread-safe as the doc states:
-    * 'Instances of this class are not safe for use by multiple concurrent threads.'
-    * (copied from: [[java.nio.charset.CharsetDecoder]])
-    *
-    * As we are using it in a [[io.shiftleft.passes.ParallelCpgPass]] it needs to be thread-safe.
-    * Hence, we make sure to create a new instance everytime.
-    */
-  private def createDecoder(): CharsetDecoder =
-    Codec.UTF8.decoder
-      .onMalformedInput(CodingErrorAction.REPLACE)
-      .onUnmappableCharacter(CodingErrorAction.REPLACE)
-
-  private val validUnicodeRegex = """([a-zA-Z0-9]){4}""".r
-
-  private val boms = Set(
-    '\uefbb', // UTF-8
-    '\ufeff', // UTF-16 (BE)
-    '\ufffe' // UTF-16 (LE)
-  )
-
   def getFileTree(rootPath: Path,
                   config: Config,
                   extensions: List[String],
@@ -149,54 +123,13 @@
     copyTo(from, directory / from.name, config)(copyOptions)
   }
 
-  def bufferedSourceFromFile(path: Path): BufferedSource = {
-    Source.fromFile(path.toFile)(createDecoder())
-  }
-
-  private def skipBOMIfPresent(reader: Reader): Unit = {
-    reader.mark(1)
-    val possibleBOM = new Array[Char](1)
-    reader.read(possibleBOM)
-    if (!boms.contains(possibleBOM(0))) {
-      reader.reset()
-    }
-  }
+  def readLinesInFile(path: Path): Seq[String] =
+    EmScriptenCleaner.clean(IOUtils.readLinesInFile(path)).toSeq
 
-  private def removeUnpairedSurrogates(input: String): String = {
-    var result = input
-    """(\\u)""".r.findAllMatchIn(input).foreach { pos =>
-      val matchedString = input.substring(pos.start + 2, pos.start + 6)
-      if (validUnicodeRegex.matches(matchedString)) {
-        val c = new BigInteger(matchedString, 16).intValue().asInstanceOf[Char]
-        if (Character.isLowSurrogate(c) || Character.isHighSurrogate(c)) {
-          // removing them including leading '\' (needs escapes for backslash itself + for the regex construction)
-          result = result.replaceAll("(\\\\)*\\\\u" + matchedString, "")
-        }
-      }
-    }
-    result
-  }
-
-  def contentFromBufferedSource(bufferedSource: BufferedSource): String = {
-    val reader = bufferedSource.bufferedReader()
-    skipBOMIfPresent(reader)
-    EmScriptenCleaner
-      .clean(reader.lines().iterator().asScala)
-      .map(removeUnpairedSurrogates)
-      .mkString("\n")
-  }
-
-  def contentMapFromBufferedSource(bufferedSource: BufferedSource): Map[Int, String] = {
-    val reader = bufferedSource.bufferedReader()
-    skipBOMIfPresent(reader)
-    EmScriptenCleaner
-      .clean(reader.lines().iterator().asScala)
-      .zipWithIndex
-      .map {
-        case (line, lineNumber) => lineNumber -> removeUnpairedSurrogates(line)
-      }
-      .toMap
-  }
+  def contentMapFromFile(path: Path): Map[Int, String] =
+    readLinesInFile(path).zipWithIndex.map {
+      case (line, lineNumber) => lineNumber -> line
+    }.toMap
 
   def positionLookupTables(source: String): (SortedMap[Int, Int], SortedMap[Int, Int]) = {
     val positionToLineNumber, positionToFirstPositionInLine = mutable.TreeMap.empty[Int, Int]
@@ -237,12 +170,12 @@
    * By using Scala BufferedSource we gain a lot of performance as it uses
    * a Java PushbackReader and BufferedReader.
    */
-  def fileStatistics(source: Source): FileStatistics = {
+  def fileStatistics(lines: Seq[String]): FileStatistics = {
     var linesOfCode = 0L
     var longestLineLength = 0
     var containsMarker = false
 
-    for (line <- source.getLines()) {
+    for (line <- lines) {
       val currLength = line.length
       if (currLength > longestLineLength) {
         longestLineLength = currLength
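Taken together, the reworked FileUtils surface is line-oriented. A hedged usage sketch of the three helpers touched by this diff (the input path is hypothetical; the signatures are the ones visible above):

```scala
import java.nio.file.Paths

import io.shiftleft.js2cpg.io.FileUtils

object FileUtilsSketch {
  def main(args: Array[String]): Unit = {
    val path = Paths.get("src", "index.js") // hypothetical input file

    // All lines of the file, already run through the emscripten cleanup.
    val lines: Seq[String] = FileUtils.readLinesInFile(path)

    // 0-based line number -> line content, built on top of readLinesInFile.
    val byLine: Map[Int, String] = FileUtils.contentMapFromFile(path)

    // Statistics are now computed from the pre-read lines instead of a scala.io.Source.
    val stats = FileUtils.fileStatistics(lines)

    println(s"${byLine.size} lines, ${stats.linesOfCode} lines of code")
  }
}
```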
5 changes: 2 additions & 3 deletions src/main/scala/io/shiftleft/js2cpg/io/JsFileChecks.scala
@@ -6,7 +6,6 @@ import io.shiftleft.js2cpg.io.FileUtils.FileStatistics
 import org.slf4j.LoggerFactory
 
 import scala.collection.mutable
-import scala.io.Source
 
 object JsFileChecks {
 
@@ -22,8 +21,8 @@
         | You might want to exclude this file when running js2cpg by adding it to '--${Js2cpgArgumentsParser.EXCLUDE}'.""".stripMargin)
   }
 
-  def check(relPath: String, source: Source): FileStatistics = {
-    val fileStatistics = FileUtils.fileStatistics(source)
+  def check(relPath: String, lines: Seq[String]): FileStatistics = {
+    val fileStatistics = FileUtils.fileStatistics(lines)
     val reasons = mutable.ArrayBuffer.empty[String]
 
     // check for very large files (many lines):
52 changes: 23 additions & 29 deletions src/main/scala/io/shiftleft/js2cpg/parser/JsSource.scala
@@ -11,7 +11,6 @@ import io.shiftleft.js2cpg.preprocessing.NuxtTranspiler
 import org.slf4j.LoggerFactory
 
 import scala.jdk.CollectionConverters._
-import scala.util.Using
 
 class JsSource(val srcDir: File, val projectDir: Path, val source: Source) {
 
@@ -111,37 +110,32 @@
       logger.debug(s"No source map file available for '$originalFilePath'")
       None
     } else {
-      Using(FileUtils.bufferedSourceFromFile(Paths.get(mapFilePath))) { sourceMapBuffer =>
-        val sourceMap =
-          ReadableSourceMapImpl.fromSource(FileUtils.contentFromBufferedSource(sourceMapBuffer))
-        val sourceFileNames = sourceMap.getSources.asScala
+      val sourceMapContent = FileUtils.readLinesInFile(Paths.get(mapFilePath)).mkString("\n")
+      val sourceMap = ReadableSourceMapImpl.fromSource(sourceMapContent)
+      val sourceFileNames = sourceMap.getSources.asScala
 
-        // The source file might not exist, e.g., if it was the result of transpilation
-        // but is not delivered and still referenced in the source map
-        // (fix for: https://github.com/ShiftLeftSecurity/product/issues/4994)
-        val sourceFile = sourceFileNames
-          .find(_.toLowerCase.endsWith(File(absoluteFilePath).nameWithoutExtension + VUE_SUFFIX))
-          .orElse(sourceFileNames.headOption)
+      // The source file might not exist, e.g., if it was the result of transpilation
+      // but is not delivered and still referenced in the source map
+      // (fix for: https://github.com/ShiftLeftSecurity/product/issues/4994)
+      val sourceFile = sourceFileNames
+        .find(_.toLowerCase.endsWith(File(absoluteFilePath).nameWithoutExtension + VUE_SUFFIX))
+        .orElse(sourceFileNames.headOption)
 
-        sourceFile.flatMap { sourceFileName =>
-          val sourceFilePath = constructSourceFilePath(sourceFileName)
-          if (!sourceFilePath.exists) {
-            logger.debug(
-              s"Could not load source map file for '$originalFilePath'. The source map file refers to '$sourceFilePath' but this does not exist")
-            None
-          } else {
-            Using(FileUtils.bufferedSourceFromFile(sourceFilePath.path)) { sourceFileBuffer =>
-              val sourceFileMapping =
-                FileUtils.contentMapFromBufferedSource(sourceFileBuffer)
-              logger.debug(
-                s"Successfully loaded source map file '$mapFilePath':" +
-                  s"\n\t* Transpiled file: '$absoluteFilePath'" +
-                  s"\n\t* Origin: '$sourceFilePath'")
-              SourceMapOrigin(sourceFilePath.path, Some(sourceMap), sourceFileMapping)
-            }.toOption
-          }
+      sourceFile.flatMap { sourceFileName =>
+        val sourceFilePath = constructSourceFilePath(sourceFileName)
+        if (!sourceFilePath.exists) {
+          logger.debug(
+            s"Could not load source map file for '$originalFilePath'. The source map file refers to '$sourceFilePath' but this does not exist")
+          None
+        } else {
+          val sourceFileMapping = FileUtils.contentMapFromFile(Paths.get(mapFilePath))
+          logger.debug(
+            s"Successfully loaded source map file '$mapFilePath':" +
+              s"\n\t* Transpiled file: '$absoluteFilePath'" +
+              s"\n\t* Origin: '$sourceFilePath'")
+          Some(SourceMapOrigin(sourceFilePath.path, Some(sourceMap), sourceFileMapping))
+        }
-      }.get // safe, as we checked the existence of the sourcemap file already above
       }
     }
   }
 
src/main/scala/io/shiftleft/js2cpg/parser/PackageJsonParser.scala
@@ -6,6 +6,7 @@ import org.slf4j.LoggerFactory
 import play.api.libs.json.Json
 
 import scala.collection.concurrent.TrieMap
+import scala.util.Try
 import scala.util.Using
 
 object PackageJsonParser {
@@ -29,11 +30,9 @@
       val depsPath = packageJsonPath
       val lockDepsPath = packageJsonPath.resolveSibling(Paths.get(PACKAGE_JSON_LOCK_FILENAME))
 
-      val lockDeps = Using(FileUtils.bufferedSourceFromFile(lockDepsPath)) { bufferedSource =>
-        val content = FileUtils.contentFromBufferedSource(bufferedSource)
-        val packageJson = Json.parse(content)
-
-        (packageJson \ "dependencies")
+      val lockDeps = Try {
+        val content = FileUtils.readLinesInFile(lockDepsPath).mkString("\n")
+        (Json.parse(content) \ "dependencies")
           .asOpt[Map[String, Map[String, String]]]
           .map { versions =>
             versions.map {
@@ -44,10 +43,9 @@
       }.toOption
 
       // lazy val because we only evaluate this in case no package lock file is available.
-      lazy val deps = Using(FileUtils.bufferedSourceFromFile(depsPath)) { bufferedSource =>
-        val content = FileUtils.contentFromBufferedSource(bufferedSource)
+      lazy val deps = Try {
+        val content = FileUtils.readLinesInFile(depsPath).mkString("\n")
         val packageJson = Json.parse(content)
-
         projectDependencies
           .flatMap { dependency =>
             (packageJson \ dependency).asOpt[Map[String, String]]
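The "lazy val" comment above captures the design choice: the lock file wins, and the plain package.json is only parsed when no lock-file dependencies were found. A simplified, hypothetical sketch of that fallback (not the actual method body; the dependency values are made up):

```scala
import scala.util.Try

object DependencyFallbackSketch {
  // Stand-in for the Try block that parses package-lock.json.
  def lockFileDependencies(): Option[Map[String, String]] =
    Try(Map("express" -> "4.17.1")).toOption // hypothetical parsed content

  // Stand-in for the Try block that parses package.json;
  // lazy, so it is only evaluated when the lock file yields nothing.
  lazy val packageJsonDependencies: Option[Map[String, String]] =
    Try(Map("express" -> "^4.17.0")).toOption // hypothetical parsed content

  def dependencies: Map[String, String] =
    lockFileDependencies().orElse(packageJsonDependencies).getOrElse(Map.empty)

  def main(args: Array[String]): Unit =
    println(dependencies)
}
```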
