Skip to content

Commit

Permalink
Add CLI option to use treeReduce on Spark
Browse files — browse the repository at this point in the history
  • Loading branch information
michaelmior committed Oct 6, 2024
1 parent 2b04dc2 commit 9fdfbc6
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
### Added
- Add CLI option (`-t`, `--tree-reduce`) to use `treeReduce` on Spark

## [0.40.0] - 2024-08-22
### Added
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ private final case class Config(
propertySet: PropertySet = PropertySets.AllProperties,
addDefinitions: Boolean = false,
detectDynamic: Boolean = false,
detectDisjoint: Boolean = false
detectDisjoint: Boolean = false,
treeReduce: Boolean = false
)

object JsonoidSpark {
Expand Down Expand Up @@ -60,6 +61,10 @@ object JsonoidSpark {
opt[Unit]('j', "detect-disjoint")
.action((x, c) => c.copy(detectDisjoint = true))
.text("detect objects with disjoint keys")

opt[Unit]('t', "tree-reduce")
.action((x, c) => c.copy(treeReduce = true))
.text("use treeReduce for schema reduction")
}

parser.parse(args, Config()) match {
Expand All @@ -71,8 +76,11 @@ object JsonoidSpark {
val jsonRdd = JsonoidRDD.fromStringRDD(
sc.textFile(config.input)
)(p)
var schema: ObjectSchema =
var schema: ObjectSchema = if (config.treeReduce) {
jsonRdd.treeReduceSchemas().asInstanceOf[ObjectSchema]
} else {
jsonRdd.reduceSchemas().asInstanceOf[ObjectSchema]
}

// Skip transformation if we know the required properties don't exist
if (!(config.propertySet === PropertySets.MinProperties)) {
Expand Down

0 comments on commit 9fdfbc6

Please sign in to comment.