From bec68436ddeaf6afbedd73f8e0a46f011e84ed19 Mon Sep 17 00:00:00 2001 From: Michael Beckerle Date: Thu, 19 Dec 2024 15:15:06 -0500 Subject: [PATCH] Improve schema compiler performance With this change, schema compilation for Link16 drops from 2+ minutes to 18 seconds. Added instrumentation to compiler which shows that most time (by far, like 80+ percent) is spent in Xerces validating each schema file individually. This is unnecessary. We only need to validate the top level schema to detect UPA errors. Fixed other error causing repeated evaluation of OOLAG LV values. Eliminate isError inside synchronized block Avoids slowdown of Daffodil test suite. DAFFODIL-2965, DAFFODIL-2781 --- .../daffodil/core/compiler/Compiler.scala | 19 ++++- .../daffodil/core/dsom/DFDLSchemaFile.scala | 19 ++++- .../apache/daffodil/core/dsom/SchemaSet.scala | 50 ++++++++--- .../runtime1/SchemaSetRuntime1Mixin.scala | 9 +- .../org/apache/daffodil/japi/Daffodil.scala | 4 +- .../apache/daffodil/example/TestJavaAPI.java | 2 +- .../org/apache/daffodil/lib/oolag/OOLAG.scala | 85 +++++++++++-------- .../daffodil/lib/xml/DaffodilXMLLoader.scala | 40 +++++++-- .../daffodil/example/TestScalaAPI.scala | 2 +- .../section00/general/TestResolver.tdml | 3 +- 10 files changed, 164 insertions(+), 69 deletions(-) diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala b/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala index 44b50ecc7e..78db91234d 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/core/compiler/Compiler.scala @@ -144,7 +144,7 @@ final class ProcessorFactory private ( codeGenerator } - override def isError: Boolean = sset.isError + override lazy val isError: Boolean = sset.isError def withDistinguishedRootNode(name: String, namespace: String): ProcessorFactory = { Assert.usage(name ne null) @@ -378,7 +378,13 @@ class Compiler private ( tunables ) } - + // It is tempting to call pf.isError here to drive compilation to completion before we + // return the pf to the caller. + // However, this slows down TDML-based testing in Daffodil substantially by moving + // the entire isError checking pass inside the synchronized block of the Daffodil + // schema compiler. This results in reduced concurrency which substantially slows + // the daffodil test suite. + // pf.isError // don't call this here. Call it outside the synchronized block. pf } @@ -409,9 +415,16 @@ object Compiler { optRootName: Option[String], optRootNamespace: Option[String] ): ProcessorFactory = { - synchronized { + val pf = synchronized { c.compileSourceInternal(schemaSource, optRootName, optRootNamespace) } + // Force all compilation to complete. Called here outside of synchronized block on purpose + // to avoid over-serializing things (which would slow down large test suites like Daffodil's test suite.) + // Note that this requires that the shared data structures which require Daffodil schema compilation to + // be serialized do *not* include the data structures being modified during isError processing (which is + // lots of OOLAG evaluations). + pf.isError + pf } } diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala index 03ee27fbbe..525a187ea4 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/DFDLSchemaFile.scala @@ -205,6 +205,18 @@ final class DFDLSchemaFile( private lazy val errHandler = new DFDLSchemaFileLoadErrorHandler(schemaFileLocation) private lazy val loader = new DaffodilXMLLoader(errHandler) + lazy val isValidAsCompleteDFDLSchema: Boolean = LV('isValidAsCompleteDFDLSchema) { + try { + loader.validateAsCompleteDFDLSchema(schemaSource) + } catch { + case e: org.xml.sax.SAXParseException => + errHandler.error(e) // accumulate with error handler. + } finally { + errHandler.handleLoadErrors(this) + } + true + }.value + lazy val iiXMLSchemaDocument = LV('iiXMLSchemaDocument) { val res = makeXMLSchemaDocument(seenBefore, Some(this)) if (res.isDFDLSchema && sset.shouldValidateDFDLSchemas) { @@ -213,15 +225,16 @@ final class DFDLSchemaFile( // Some things, tests generally, want to turn this validation off. // try { - loader.validateAsDFDLSchema(schemaSource) + loader.validateAsIndividualDFDLSchemaFile(schemaSource) } catch { - // validateAsDFDLSchema doesn't always capture all exceptions in the + // validateAsIndividualDFDLSchemaFile doesn't always capture all exceptions in the // loader's error handler. Even for fatal errors it sometimes // just throws them. case e: org.xml.sax.SAXParseException => errHandler.error(e) // accumulate with error handler. + } finally { + errHandler.handleLoadErrors(this) } - errHandler.handleLoadErrors(this) } res }.value diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala index 62bdb796b2..30a06ea47c 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/core/dsom/SchemaSet.scala @@ -171,21 +171,34 @@ final class SchemaSet private ( lazy val schemaFileList = schemas.map(s => s.uriString) private lazy val isValid: Boolean = { - // - // We use keepGoing here, because we want to gather a validation error, - // suppress further propagation of it, and return false. - // - val isV = OOLAG.keepGoing(false) { - val files = allSchemaFiles - val fileValids = files.map { - _.isValid + if (!shouldValidateDFDLSchemas) + true // pretend it's valid though for some specific tests it may not be + else { + // + // We use keepGoing here, because we want to gather a validation error, + // suppress further propagation of it, and return false. + // + val isEachFileIndividuallyValid = OOLAG.keepGoing(false) { + val files = allSchemaFiles + val fileValids = files.map { + _.isValid + } + val res = fileValids.length > 0 && fileValids.fold(true) { + _ && _ + } + res } - val res = fileValids.length > 0 && fileValids.fold(true) { - _ && _ + val isEntireSchemaValidAsAnXSD: Boolean = OOLAG.keepGoing(false) { + this.root.xmlSchemaDocument.schemaFile + .map { primaryDfdlSchemaFile => + primaryDfdlSchemaFile.isValidAsCompleteDFDLSchema + } + .getOrElse(true) } + + val res = isEachFileIndividuallyValid && isEntireSchemaValidAsAnXSD res } - isV } lazy val validationDiagnostics = { @@ -369,7 +382,7 @@ final class SchemaSet private ( * Or, you can leave the root unspecified, and this method will determine it from the * first element declaration of the first schema file. */ - lazy val root: Root = { + lazy val root: Root = LV('root) { val re: GlobalElementDecl = optPFRootSpec match { case Some(rs) => @@ -393,7 +406,7 @@ final class SchemaSet private ( case _ => Assert.invariantFailed("illegal combination of root element specifications") } re.asRoot - } + }.value /** * Retrieve schema by namespace @@ -670,7 +683,7 @@ final class SchemaSet private ( * and finally the AST objects are checked for errors, which recursively * demands that all other aspects of compilation occur. */ - override def isError: Boolean = { + override lazy val isError: Boolean = { if (!isValid) true else if ( // use keepGoing so we can capture errors and @@ -681,6 +694,15 @@ final class SchemaSet private ( } ) true else { + // we must check for errors here via the super method, as that iterates over + // all the objects evaluating them for any errors in their required evaluations. + // This has to be after everything else that could report an error here (on some + // other object) has been done. + // + // That is to say, if we called super.isError at the top of this method that would + // be incorrect since isValid and areComponentsConstructed above might cause errors + // to be recorded or objects created that this call to super.isError would then not + // take into account. val hasErrors = super.isError if (!hasErrors) { // must be called after compilation is done diff --git a/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala b/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala index 25a1290f82..4b2ee489ae 100644 --- a/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala +++ b/daffodil-core/src/main/scala/org/apache/daffodil/core/runtime1/SchemaSetRuntime1Mixin.scala @@ -48,21 +48,24 @@ trait SchemaSetRuntime1Mixin { }.value lazy val parser = LV('parser) { - val par = if (generateParser) root.document.parser else new NotParsableParser(root.erd) + val par = + if (generateParser) root.document.parser + else new NotParsableParser(root.erd) Processor.initialize(par) par }.value lazy val unparser = LV('unparser) { val unp = - if (generateUnparser) root.document.unparser else new NotUnparsableUnparser(root.erd) + if (generateUnparser) root.document.unparser + else new NotUnparsableUnparser(root.erd) Processor.initialize(unp) unp }.value private lazy val layerRuntimeCompiler = new LayerRuntimeCompiler - lazy val allLayers: Seq[LayerRuntimeData] = LV('allLayers) { + private lazy val allLayers: Seq[LayerRuntimeData] = LV('allLayers) { val lrds: Seq[LayerRuntimeData] = self.allSchemaComponents .collect { case stb: SequenceTermBase if (stb.isLayered) => stb diff --git a/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala b/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala index 9e590c8602..4114e91a0d 100644 --- a/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala +++ b/daffodil-japi/src/main/scala/org/apache/daffodil/japi/Daffodil.scala @@ -298,7 +298,9 @@ class ProcessorFactory private[japi] (private var pf: SProcessorFactory) */ def onPath(path: String) = { val dp = pf.onPath(path).asInstanceOf[SDataProcessor] - new DataProcessor(dp) + val res = new DataProcessor(dp) + res.isError // ensure all errors have been detected before we return the DP + res } /** diff --git a/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java b/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java index a42df23e38..0942ef79bc 100644 --- a/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java +++ b/daffodil-japi/src/test/java/org/apache/daffodil/example/TestJavaAPI.java @@ -1252,7 +1252,7 @@ public void testJavaAPI27() throws IOException { Throwable cause = e.getCause(); assertTrue(cause.toString().contains("Must call isError")); assertTrue(cause.getCause().toString().contains("Schema Definition Error")); - assertTrue(cause.getCause().toString().contains("Cannot resolve the name 'tns:nonExistent'")); + assertTrue(cause.getCause().toString().contains("tns:nonExistent")); } } diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala index 56b2725b4f..31856bff71 100644 --- a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala +++ b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/oolag/OOLAG.scala @@ -478,16 +478,13 @@ object OOLAG { } /** - * Currently we depend on being able to evaluate these - * repeatedly, and get different answers. - * - * because it forces evaluation of all the requiredEvaluationsAlways(...) + * Forces evaluation of all the requiredEvaluationsAlways(...) * or requiredEvaluationsIfActivated(...) * on all objects first, but that is only for the objects * that have been created and activated at the time this is called. */ - - def isError: Boolean = { + def isError: Boolean = isErrorOnce + private lazy val isErrorOnce: Boolean = { oolagRoot.checkErrors val errorCount = oolagRoot.errors.size errorCount > 0 @@ -511,18 +508,27 @@ object OOLAG { sealed abstract class OOLAGValueBase( val oolagContext: OOLAGHost, nameArg: String, - body: => Any + bodyArg: => Any ) { + // SCHEMA COMPILER PERFORMANCE INSTRUMENTATION + // To get timing on every OOLAG LV, use these 3 lines to define the 'body' lazy var. + // private lazy val bodyOnce = bodyArg + // private lazy val timedBody = TimeTracker.track(name) { bodyOnce } + // private lazy val body = timedBody + // You will also need to call TimeTracker.logTimes() at the end of Compiler.compileSource. + private lazy val body = bodyArg + Assert.usage(oolagContext != null) final lazy val name = nameArg - private var alreadyTriedThis = false + private var errorAlreadyHandled: Maybe[ErrorAlreadyHandled] = Nope + private var alreadyTriedThis: Maybe[AlreadyTried] = Nope protected final def hasValue: Boolean = value_.isDefined private var value_ : Maybe[AnyRef] = Nope - protected final def wasTried = alreadyTriedThis + private final def wasTried = alreadyTriedThis.isDefined // private def warn(th: Diagnostic): Unit = oolagContext.warn(th) private def error(th: Diagnostic): Unit = oolagContext.error(th) @@ -549,9 +555,9 @@ object OOLAG { } } - override def toString = thisThing + override def toString: String = thisThing - protected final def toss(th: Throwable) = { + protected final def toss(th: Throwable): Nothing = { throw th } @@ -603,7 +609,8 @@ object OOLAG { // Typically this will be for a Schema Definition Error // Assert.invariant(hasValue == false) - Assert.invariant(alreadyTriedThis == true) + Assert.invariant(alreadyTriedThis.isDefined) + Assert.invariant(errorAlreadyHandled.isEmpty) Logger.log.trace(" " * indent + s"${thisThing} has no value due to ${e}") error(e) @@ -611,11 +618,15 @@ object OOLAG { // Catch this if you can carry on with more error gathering // from other contexts. Otherwise just let it propagate. // - toss(new ErrorAlreadyHandled(e, this)) + val eah = new ErrorAlreadyHandled(e, this) + errorAlreadyHandled = One(eah) // in theory, saving this doesn't matter. + toss(eah) } case e @ ErrorsNotYetRecorded(diags) => { + Assert.invariant(alreadyTriedThis.isDefined) + Assert.invariant(!hasValue) diags.foreach { error(_) } - toss(new AlreadyTried(this)) + toss(alreadyTriedThis.get) } case th => toss(th) } @@ -639,12 +650,12 @@ object OOLAG { Logger.log.trace(" " * indent + s"LV: ${thisThing} CIRCULAR") toss(c) } - if (alreadyTriedThis) { + if (alreadyTriedThis.isDefined) { Logger.log.trace(" " * indent + s"LV: ${thisThing} was tried and failed") - val e = AlreadyTried(this) + val e = alreadyTriedThis.get toss(e) } - alreadyTriedThis = true + alreadyTriedThis = One(AlreadyTried(this)) Logger.log.trace(" " * indent + s"Evaluating ${thisThing}") } @@ -660,7 +671,7 @@ object OOLAG { oolagContext.currentOVList = oolagContext.currentOVList.tail } - final def hasError = alreadyTriedThis && !hasValue + final def hasError = alreadyTriedThis.isDefined && !hasValue /** * forces the value, then boolean result tells you if @@ -670,7 +681,7 @@ object OOLAG { final def isError = { val res = - if (alreadyTriedThis) !hasValue + if (alreadyTriedThis.isDefined) !hasValue else { try { valueAsAny @@ -687,22 +698,26 @@ object OOLAG { final lazy val valueAsAny: Any = { if (hasValue) value_.get - val res = - try { - oolagBefore - val v = body // good place for a breakpoint - oolagAfterValue(v.asInstanceOf[AnyRef]) - v - } catch { - case npe: NullPointerException => throw npe - case s: scala.util.control.ControlThrowable => throw s - case u: UnsuppressableException => throw u - case e: Error => throw e - case th: Throwable => oolagCatch(th) - } finally { - oolagFinalize - } - res + else { + // egad.... on 2024-12-19 this was always re-evaluating due to missing `else` keyword. + // this should make a substantial difference in schema compilation time. + val res = + try { + oolagBefore + val v = body // good place for a breakpoint + oolagAfterValue(v.asInstanceOf[AnyRef]) + v + } catch { + case npe: NullPointerException => throw npe + case s: scala.util.control.ControlThrowable => throw s + case u: UnsuppressableException => throw u + case e: Error => throw e + case th: Throwable => oolagCatch(th) + } finally { + oolagFinalize + } + res + } } protected lazy val toOptionAny: Option[Any] = { diff --git a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala index 34eb2fda51..dd876c7369 100644 --- a/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala +++ b/daffodil-lib/src/main/scala/org/apache/daffodil/lib/xml/DaffodilXMLLoader.scala @@ -542,11 +542,42 @@ class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) sf } + /** + * checks that a DFDL schema file, viewed as an XML document is valid + * relative to the XML Schema for DFDL Schemas. + * + * For example this ensures that you can only have minOccurs and maxOccurs + * on local element declarations and element refs, not on sequence/choice/group. + */ + def validateAsIndividualDFDLSchemaFile(source: DaffodilSchemaSource): Unit = { + // first we load it, with validation explicitly against the + // schema for DFDL Schemas. + try { + load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes = true) + } catch { + // fatal errors are thrown. + // validation errors are never fatal. + case e: SAXParseException => { + // Capturing this would be redundant. + // It will already have been passed to the errorHandler.fatalError + // method. + // So it is explicitly ok to just rethrow this exception. + // we don't want to record it again, but we do want to stop with a + // fatal error because the schema was invalid. Daffodil assumes the + // schema is valid all over its code base. + throw e + } + } + } + /** * This loads the DFDL schema as an XML Schema. This will * check many more things (ex: UPA) about the DFDL schema other than * just whether it validates against the XML Schema for DFDL schemas. * + * This should only be called once, for the entire schema, not per file of + * the schema. + * * Unfortunately, we don't have control over how Xerces loads up these schemas * (other than the resolver anyway), so we can't really redirect the way * it issues error messages so that it properly lays blame at say, the schema fragments @@ -562,13 +593,10 @@ class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) * on the XMLUtils.setSecureDefaults, so we don't need to * further check that here. */ - def validateAsDFDLSchema(source: DaffodilSchemaSource): Unit = { - // first we load it, with validation explicitly against the - // schema for DFDL Schemas. + def validateAsCompleteDFDLSchema(source: DaffodilSchemaSource): Boolean = { try { - load(source, Some(XMLUtils.schemaForDFDLSchemas), addPositionAttributes = true) // - // Then we validate explicitly so Xerces can check things + // We validate explicitly so Xerces can check things // such as for UPA violations // val inputSource = source.newInputSource() @@ -584,6 +612,7 @@ class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) } finally { inputSource.getByteStream().close() } + true } catch { // fatal errors are thrown. // validation errors are never fatal. @@ -598,7 +627,6 @@ class DaffodilXMLLoader(val errorHandler: org.xml.sax.ErrorHandler) throw e } } - } // $COVERAGE-OFF$ These three functions should only be used if someone calls one of the diff --git a/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala b/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala index 0406672906..2456c474e9 100644 --- a/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala +++ b/daffodil-sapi/src/test/scala/org/apache/daffodil/example/TestScalaAPI.scala @@ -1234,7 +1234,7 @@ class TestScalaAPI { cause.getCause.toString.contains("Schema Definition Error") ) assertTrue( - cause.getCause.toString.contains("Cannot resolve the name 'tns:nonExistent'") + cause.getCause.toString.contains("tns:nonExistent") ) } } diff --git a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml index 7176b7710a..8374ebd193 100644 --- a/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml +++ b/daffodil-test/src/test/resources/org/apache/daffodil/section00/general/TestResolver.tdml @@ -36,9 +36,8 @@ Schema Definition Error - Unable to resolve this/does/not/exist/schema.dfdl.xsd - Schema context: Location in + Include Location