Skip to content

Commit

Permalink
Add calculation of schema entropy
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelmior committed Aug 3, 2023
1 parent 2389190 commit 9307b08
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Add calculation of schema entropy

## [0.16.0]
### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,37 @@ final case class ArraySchema(
override def toString: String = {
pretty(render(toJson()))
}

/** Calculates the entropy of this array schema.
  *
  * When all items share a single schema, the entropy is the entropy of that
  * item schema multiplied by the number of permitted array lengths
  * (`maxItems - minItems + 1`). This requires both length bounds to be
  * known. When the array is a tuple, the entropy is the sum of the
  * entropies of the schemas at each position.
  *
  * @param p the parameters passed on to nested entropy calculations
  * @return the entropy, or `None` if a length bound is missing or if the
  *         entropy of any nested schema cannot be calculated
  */
override def entropy(implicit p: JsonoidParams): Option[Long] = {
  properties.get[ItemTypeProperty].itemType match {
    case Left(itemSchema) =>
      val minItems =
        properties.getOrNone[MinItemsProperty].flatMap(_.minItems)
      val maxItems =
        properties.getOrNone[MaxItemsProperty].flatMap(_.maxItems)

      // We can't calculate entropy for array schemas without both length
      // bounds, so any missing value short-circuits to None
      for {
        min <- minItems
        max <- maxItems
        itemEntropy <- itemSchema.entropy
      } yield itemEntropy * (max - min + 1)

    case Right(tupleSchemas) =>
      // Sum the entropies from each element in the tuple, producing None
      // as soon as one element has no defined entropy
      tupleSchemas.foldLeft[Option[Long]](Some(0L)) { (total, element) =>
        for {
          soFar <- total
          elementEntropy <- element.entropy
        } yield soFar + elementEntropy
      }
  }
}
}

/** The type of item stored in this array schema.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -601,4 +601,11 @@ trait JsonSchema[T] {
.expandTo(other)
}
}

/** The number of possible types accepted by this schema.
  *
  * This default implementation always reports a single possible type. It
  * must be overridden by subclasses to do anything useful.
  *
  * @param p the parameters for the calculation (not used by this default
  *          implementation)
  * @return the number of types or `None` if entropy cannot be calculated
  */
def entropy(implicit p: JsonoidParams): Option[Long] = Some(1L)
}
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,38 @@ final case class ObjectSchema(

ObjectSchema(newProps)(p)
}

/** Calculates the entropy of this object schema.
  *
  * A schema which permits additional properties is open and has unbounded
  * entropy, which is reported as `None`. Otherwise the entropy is the
  * product over all known keys of the entropy of the key's schema, plus one
  * for keys which are not required (to account for the key being absent).
  *
  * @param p the parameters supplying the default for additional properties
  *          and passed on to nested entropy calculations
  * @return the entropy, or `None` if the schema is open or if the entropy
  *         of any property cannot be calculated
  */
override def entropy(implicit p: JsonoidParams): Option[Long] = {
  val additionalProperties = properties
    .getOrNone[AdditionalPropertiesProperty]
    .flatMap(_.overriddenAdditionalProperties)
    .getOrElse(p.additionalProperties)

  if (additionalProperties) {
    // This schema is open so it has infinite entropy
    None
  } else {
    val objectTypes = properties.get[ObjectTypesProperty].objectTypes
    val requiredProperties =
      properties
        .getOrNone[RequiredProperty]
        .flatMap(_.required)
        .getOrElse(Set.empty)

    // TODO Entropy is reduced by dependencies

    // Each property's entropy is computed exactly once. The running product
    // becomes None as soon as one property has no defined entropy, since we
    // can only calculate entropy if it is defined for all keys.
    objectTypes.foldLeft[Option[Long]](Some(1L)) {
      case (total, (key, schema)) =>
        // An optional key adds one more possibility: the key is missing
        val absentChoices = if (requiredProperties.contains(key)) 0L else 1L
        for {
          soFar <- total
          keyEntropy <- schema.entropy
        } yield soFar * (keyEntropy + absentChoices)
    }
  }
}
}

/** The types of all keys in an object schema.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,21 @@ final case class ProductSchema(
case None => this
}
}

/** Calculates the entropy of this product schema.
  *
  * Entropy is only calculated for `OneOf` products, where it is the entropy
  * of the base schema multiplied by the sum of the entropies of the
  * alternative schemas.
  *
  * @param p the parameters passed on to nested entropy calculations
  * @return the entropy, or `None` if this is not a `OneOf` product or if
  *         the entropy of the base schema or any alternative cannot be
  *         calculated
  */
override def entropy(implicit p: JsonoidParams): Option[Long] = {
  val schemaTypes = properties.get[ProductSchemaTypesProperty]

  // Check the product type first so that no nested entropy is computed
  // for products where the result is always None
  if (schemaTypes.productType === OneOf) {
    // Sum of the alternatives, or None if any alternative is undefined
    val alternativesEntropy =
      schemaTypes.schemaTypes.foldLeft[Option[Long]](Some(0L)) {
        (total, alternative) =>
          for {
            soFar <- total
            alternativeEntropy <- alternative.entropy
          } yield soFar + alternativeEntropy
      }

    for {
      baseEntropy <- schemaTypes.baseSchema.entropy
      alternatives <- alternativesEntropy
    } yield baseEntropy * alternatives
  } else {
    None
  }
}
}

sealed trait ProductType {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ class ArraySchemaSpec extends UnitSpec with ScalaCheckPropertyChecks {
tupleProp.isSubsetOf(arrayProp) shouldBe true
}

it should "calculate entropy for a tuple schema" in {
  // A tuple's entropy is the sum of the entropies of its positions
  // NOTE(review): tupleSchema is a fixture defined outside this hunk;
  // presumably it has two positions with one possible type each
  val tupleEntropy = tupleSchema.entropy
  tupleEntropy should be(Some(2))
}

it should "calculate entropy for an array schema" in {
  // Two array lengths are possible (either 1 or 2), which gives
  // an entropy of 2
  val arrayEntropy = arraySchema.entropy
  arrayEntropy should be(Some(2))
}

behavior of "MinItemsProperty"

it should "track minimum array length" in {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,17 @@ class ObjectSchemaSpec extends UnitSpec with ScalaCheckPropertyChecks {
}
}

it should "calculate entropy for simple objects" in {
  // NOTE(review): objectSchema is a fixture defined outside this hunk;
  // the expected value depends on its keys and which of them are required
  val objectEntropy = objectSchema.entropy
  objectEntropy should be(Some(4))
}

it should "calculate entropy for nested objects" in {
  // Wrap the same object schema under two different keys and merge the
  // two wrappers into a single schema containing both keys
  val underBaz = ObjectSchema(Map("baz" -> objectSchema))
  val underQuux = ObjectSchema(Map("quux" -> objectSchema))
  val combined = underBaz.merge(underQuux)

  // 25 = (4 + 1) * (4 + 1): each key holds a schema with entropy 4 and
  // a non-required key contributes one extra possibility, so this expects
  // both keys to be optional after the merge
  combined.entropy should be(Some(25))
}

behavior of "ObjectTypesProperty"

it should "calculate the intersection of properties" in {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,4 +283,8 @@ class ProductSchemaSpec extends UnitSpec with ScalaCheckPropertyChecks {
.get
) shouldBe true
}

it should "calculate entropy for a simple product" in {
  // NOTE(review): productSchema1 is a fixture defined outside this hunk;
  // presumably a OneOf product whose alternatives' entropies sum to 2
  val productEntropy = productSchema1.entropy
  productEntropy should be(Some(2))
}
}

0 comments on commit 9307b08

Please sign in to comment.