Skip to content

Commit

Permalink
rework calculateSpecificChildrenPaths: (#3601)
Browse files Browse the repository at this point in the history
Rework calculateSpecificChildrenPaths:
Split it into one mode with a column and one without, and simplify both along that distinction.
This fixes a bug where values were incorrectly resolved to the root.
  • Loading branch information
awildturtok authored Oct 15, 2024
1 parent ee551d4 commit 0b025f4
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 103 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
@CPSBase
public interface CTCondition {

public default void init(ConceptTreeNode node) throws ConceptConfigurationException {}
default void init(ConceptTreeNode node) throws ConceptConfigurationException {}

public boolean matches(String value, CalculatedValue<Map<String, Object>> rowMap) throws ConceptConfigurationException;
boolean matches(String value, CalculatedValue<Map<String, Object>> rowMap) throws ConceptConfigurationException;

WhereCondition convertToSqlCondition(CTConditionContext context);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package com.bakdata.conquery.models.datasets.concepts.conditions;

import java.util.Map;
import jakarta.validation.constraints.NotEmpty;

import com.bakdata.conquery.io.cps.CPSType;
import com.bakdata.conquery.sql.conversion.cqelement.concept.CTConditionContext;
Expand All @@ -11,7 +12,6 @@
import com.bakdata.conquery.util.CalculatedValue;
import com.fasterxml.jackson.annotation.JsonIgnore;
import io.dropwizard.validation.ValidationMethod;
import jakarta.validation.constraints.NotEmpty;
import lombok.Getter;
import lombok.Setter;
import org.jooq.Condition;
Expand All @@ -21,29 +21,34 @@
/**
* This condition requires each value to start with a prefix between the two given values
*/
@CPSType(id="PREFIX_RANGE", base=CTCondition.class)
@CPSType(id = "PREFIX_RANGE", base = CTCondition.class)
public class PrefixRangeCondition implements CTCondition {

private static final String ANY_CHAR_REGEX = ".*";

@Getter @Setter @NotEmpty
@Getter
@Setter
@NotEmpty
private String min;
@Getter @Setter @NotEmpty
@Getter
@Setter
@NotEmpty
private String max;

@ValidationMethod(message="Min and max need to be of the same length and min needs to be smaller than max.") @JsonIgnore

@ValidationMethod(message = "Min and max need to be of the same length and min needs to be smaller than max.")
@JsonIgnore
public boolean isValidMinMax() {
if(min.length()!=max.length()) {
if (min.length() != max.length()) {
return false;
}
return min.compareTo(max)<0;
return min.compareTo(max) < 0;
}

@Override
public boolean matches(String value, CalculatedValue<Map<String, Object>> rowMap) {
if(value.length()>=min.length()) {
String pref = value.substring(0,min.length());
return min.compareTo(pref)<=0 && max.compareTo(pref)>=0;
if (value.length() >= min.length()) {
String pref = value.substring(0, min.length());
return min.compareTo(pref) <= 0 && max.compareTo(pref) >= 0;
}
return false;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ public IntFunction<Map<String, Object>> mapCalculator(){
Column[] columns = getTable().resolve().getColumns();

return event -> calculateMap(event, stores, columns);

}

@JsonIgnore
Expand Down
200 changes: 111 additions & 89 deletions backend/src/main/java/com/bakdata/conquery/models/events/CBlock.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@
import com.bakdata.conquery.models.datasets.Table;
import com.bakdata.conquery.models.datasets.concepts.Concept;
import com.bakdata.conquery.models.datasets.concepts.Connector;
import com.bakdata.conquery.models.datasets.concepts.conditions.CTCondition;
import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeCache;
import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeChild;
import com.bakdata.conquery.models.datasets.concepts.tree.ConceptTreeConnector;
import com.bakdata.conquery.models.datasets.concepts.tree.TreeConcept;
import com.bakdata.conquery.models.events.stores.root.StringStore;
import com.bakdata.conquery.models.exceptions.ConceptConfigurationException;
import com.bakdata.conquery.models.identifiable.IdentifiableImpl;
import com.bakdata.conquery.models.identifiable.ids.NamespacedIdentifiable;
Expand Down Expand Up @@ -108,94 +106,11 @@ public static CBlock createCBlock(ConceptTreeConnector connector, Bucket bucket,
* denoted by the individual {@link ConceptTreeChild#getPrefix()}.
*/
private static int[][] calculateSpecificChildrenPaths(Bucket bucket, ConceptTreeConnector connector) {

final Column column;

final TreeConcept treeConcept = connector.getConcept();

// If we have a column, and it is of string-type, we initialize a cache.
if (connector.getColumn() != null && bucket.getStore(connector.getColumn().resolve()) instanceof StringStore) {

column = connector.getColumn().resolve();

treeConcept.initializeIdCache(bucket.getImp());
}
// No column only possible if we have just one tree element!
else if (treeConcept.countElements() == 1) {
column = null;
if (connector.getColumn() == null) {
return calculateSpecificChildrenPathsWithoutColumn(bucket, connector);
}
else {
throw new IllegalStateException(String.format("Cannot build tree over Connector[%s] without Column", connector.getId()));
}

final CTCondition connectorCondition = connector.getCondition();

final int[][] mostSpecificChildren = new int[bucket.getNumberOfEvents()][];

Arrays.fill(mostSpecificChildren, ConceptTreeConnector.NOT_CONTAINED);

final ConceptTreeCache cache = treeConcept.getCache(bucket.getImp());

IntFunction<Map<String, Object>> mapCalculator = bucket.mapCalculator();

for (int event = 0; event < bucket.getNumberOfEvents(); event++) {


try {
String stringValue = "";

final boolean has = column != null && bucket.has(event, column);

if (column != null && has) {
stringValue = bucket.getString(event, column);
}

// Events can also be filtered, allowing a single table to be used by multiple connectors.
// Lazy evaluation of map to avoid allocations if possible.
// Copy event for closure.
final int _event = event;
final CalculatedValue<Map<String, Object>> rowMap = new CalculatedValue<>(() -> mapCalculator.apply(_event));

if (connectorCondition != null && !connectorCondition.matches(stringValue, rowMap)) {
mostSpecificChildren[event] = Connector.NOT_CONTAINED;
continue;
}

// Events without values are assigned to the root
if (column != null && !has) {
mostSpecificChildren[event] = treeConcept.getPrefix();
continue;
}

final ConceptTreeChild child = cache == null
? treeConcept.findMostSpecificChild(stringValue, rowMap)
: cache.findMostSpecificChild(stringValue, rowMap);

// All unresolved elements resolve to the root.
if (child == null) {
mostSpecificChildren[event] = treeConcept.getPrefix();
continue;
}

// put path into event
mostSpecificChildren[event] = child.getPrefix();
}
catch (ConceptConfigurationException ex) {
log.error("Failed to resolve event {}-{} against concept {}", bucket, event, treeConcept, ex);
}
}

if (cache != null) {
log.trace(
"Hits: {}, Misses: {}, Hits/Misses: {}, %Hits: {} (Up to now)",
cache.getHits(),
cache.getMisses(),
(double) cache.getHits() / cache.getMisses(),
(double) cache.getHits() / (cache.getHits() + cache.getMisses())
);
}

return mostSpecificChildren;
return calculateSpecificChildrenPathsWithColumn(bucket, connector);
}

/**
Expand Down Expand Up @@ -284,6 +199,113 @@ private static Map<String, CDateRange> calculateEntityDateIndices(Bucket bucket)
return spans;
}

/**
 * Calculates the path for each event of a {@link Connector} that has no column.
 *
 * Every event is assigned the prefix of the connector's concept, unless the connector's condition
 * rejects the event or cannot be evaluated for it, in which case the event is marked as
 * {@link Connector#NOT_CONTAINED}.
 *
 * @param bucket    the bucket whose events are classified
 * @param connector the column-less connector to evaluate the events against
 * @return one entry per event: either the concept's prefix or {@link Connector#NOT_CONTAINED}
 */
private static int[][] calculateSpecificChildrenPathsWithoutColumn(Bucket bucket, Connector connector) {

	final int numberOfEvents = bucket.getNumberOfEvents();
	final int[][] mostSpecificChildren = new int[numberOfEvents][];

	// All elements resolve to the root, unless they are filtered out by the condition.
	Arrays.fill(mostSpecificChildren, connector.getConcept().getPrefix());

	if (connector.getCondition() == null) {
		return mostSpecificChildren;
	}

	final IntFunction<Map<String, Object>> mapCalculator = bucket.mapCalculator();

	// Since the connector has no column, there is no real columnValue.
	// All downstream code assumes the presence of a column value, so we just pass an empty string to avoid exceptions.
	final String columnValue = "";

	for (int event = 0; event < numberOfEvents; event++) {
		try {
			// Events can also be filtered, allowing a single table to be used by multiple connectors.
			// Lazy evaluation of map to avoid allocations if possible.
			// Copy event for closure.
			final int _event = event;
			final CalculatedValue<Map<String, Object>> rowMap = new CalculatedValue<>(() -> mapCalculator.apply(_event));

			// Matching events keep the pre-filled root prefix; only rejected events need to be overwritten.
			if (!connector.getCondition().matches(columnValue, rowMap)) {
				mostSpecificChildren[event] = Connector.NOT_CONTAINED;
			}
		}
		catch (ConceptConfigurationException ex) {
			log.error("Failed to evaluate event {}, row {} against connector {}", bucket.getId(), event, connector.getId(), ex);

			// The array is pre-filled with the root prefix, so an event whose condition could not be
			// evaluated has to be excluded explicitly. Otherwise it would silently count as contained,
			// unlike in the with-column variant where failed events stay NOT_CONTAINED.
			mostSpecificChildren[event] = Connector.NOT_CONTAINED;
		}
	}

	return mostSpecificChildren;
}

/**
 * Resolves, for every event of the bucket, the prefix of the most specific {@link ConceptTreeChild}
 * of the connector's {@link TreeConcept} that the event's column value belongs to.
 *
 * Events that have no value in the connector's column, or that are rejected by the connector's
 * condition, stay {@link ConceptTreeConnector#NOT_CONTAINED}. Events for which no child is found
 * are assigned the prefix of the concept itself. Events whose resolution fails with a
 * {@link ConceptConfigurationException} are logged and also stay NOT_CONTAINED.
 *
 * @param bucket    the bucket whose events are classified
 * @param connector the connector providing column, condition and concept
 * @return one prefix per event
 */
private static int[][] calculateSpecificChildrenPathsWithColumn(Bucket bucket, ConceptTreeConnector connector) {

	final Column column = connector.getColumn().resolve();
	final TreeConcept concept = connector.getConcept();

	// The cache has to be initialized for this import before it is fetched.
	concept.initializeIdCache(bucket.getImp());
	final ConceptTreeCache cache = concept.getCache(bucket.getImp());

	final int[] rootPrefix = concept.getPrefix();

	final IntFunction<Map<String, Object>> mapCalculator = bucket.mapCalculator();

	final int eventCount = bucket.getNumberOfEvents();
	final int[][] paths = new int[eventCount][];
	Arrays.fill(paths, ConceptTreeConnector.NOT_CONTAINED);

	for (int event = 0; event < eventCount; event++) {
		// Effectively final copy of the loop index, needed by the lazy row-map supplier below.
		final int row = event;

		try {
			// Events without a value keep their NOT_CONTAINED default.
			if (bucket.has(row, column)) {

				final String value = bucket.getString(row, column);

				// The row map is only materialized if a condition actually asks for it.
				final CalculatedValue<Map<String, Object>> rowMap = new CalculatedValue<>(() -> mapCalculator.apply(row));

				// A connector-level condition allows several connectors to share one table.
				final boolean accepted = connector.getCondition() == null || connector.getCondition().matches(value, rowMap);

				if (accepted) {
					final ConceptTreeChild mostSpecific = cache.findMostSpecificChild(value, rowMap);

					// Values that match no child fall back to the root of the concept.
					paths[row] = mostSpecific == null ? rootPrefix : mostSpecific.getPrefix();
				}
			}
		}
		catch (ConceptConfigurationException ex) {
			log.error("Failed to resolve event {}, row {} against connector {}", bucket.getId(), event, connector.getId(), ex);
		}
	}

	log.trace(
			"Hits: {}, Misses: {}, Hits/Misses: {}, %Hits: {} (Up to now)",
			cache.getHits(),
			cache.getMisses(),
			(double) cache.getHits() / cache.getMisses(),
			(double) cache.getHits() / (cache.getHits() + cache.getMisses())
	);

	return paths;
}

/**
* Calculates the bloom filter from the precomputed path to the most specific {@link ConceptTreeChild}.
*/
Expand Down Expand Up @@ -331,7 +353,7 @@ public boolean isConceptIncluded(String entity, long requiredBits) {
return true;
}

if(!includedConceptElementsPerEntity.containsKey(entity)){
if (!includedConceptElementsPerEntity.containsKey(entity)) {
return false;
}

Expand Down

0 comments on commit 0b025f4

Please sign in to comment.