Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dynamic aggregation of descriptors #92

Merged
merged 29 commits into from
Nov 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
6f0f44a
Added field to descriptor identifying its source
lucaro Aug 1, 2024
09bfcbc
Merge branch 'dev' into feature/dynamic-aggregation
lucaro Aug 8, 2024
80d97b3
Added name field to Extractor
lucaro Aug 8, 2024
621d150
Revert "Added name field to Extractor"
lucaro Aug 13, 2024
f817e12
Name is now consistently passed to Extractor and linked to descriptor…
lucaro Aug 13, 2024
1e3f1d2
Merge remote-tracking branch 'origin/dev' into feature/dynamic-aggreg…
lucaro Aug 13, 2024
e531ad6
Added DescriptorDistanceSegmenter
lucaro Aug 13, 2024
ff02a6e
Fixed issue in FixedDurationSegmenter
lucaro Aug 13, 2024
dcd2f3a
Added VectorDescriptorAggregator
lucaro Aug 14, 2024
8c857fd
Added facility to assign existing descriptors to a field
lucaro Aug 15, 2024
42f0c45
Merge remote-tracking branch 'origin/dev' into feature/dynamic-aggreg…
lucaro Aug 15, 2024
9a57907
Merge branch 'dev' into feature/dynamic-aggregation
lucaro Aug 21, 2024
4e6ad96
Fixed some compilation errors
lucaro Aug 21, 2024
6cd74fb
Merge branch 'dev' into feature/dynamic-aggregation
ppanopticon Aug 23, 2024
f31823c
Moved name field up to Operator
lucaro Aug 23, 2024
f09db70
Merge branch 'feature/dynamic-aggregation' of https://github.com/vitr…
lucaro Aug 23, 2024
7fd87b5
Merge remote-tracking branch 'origin/dev' into feature/dynamic-aggreg…
lucaro Aug 23, 2024
a2ca74a
VectorDescriptorAggregator uses Double instead of Float
lucaro Aug 23, 2024
a474b41
Merge branch 'dev' into feature/dynamic-aggregation
lucaro Oct 21, 2024
9550a50
Minor fix in test
lucaro Oct 21, 2024
44e179b
Split DescriptorFieldMapper into type-specific implementations
lucaro Oct 23, 2024
7d6321b
Minor fixes
lucaro Oct 23, 2024
2b55f9e
Fixed Cosine distance
lucaro Oct 23, 2024
e7e5917
Added relationship re-mapping mechanism
lucaro Oct 23, 2024
12da9fb
Fixed relationships also in FixedDurationSegmenter
lucaro Oct 23, 2024
e56160d
Some fixes in handling of relations and time attributes
lucaro Oct 24, 2024
8b17c16
Fixed issue with changing sources in DescriptionDistanceSegmenter
lucaro Oct 24, 2024
8473cc8
Fixed trailing retrievables being lost
lucaro Oct 24, 2024
275669d
Merge remote-tracking branch 'origin/dev' into feature/dynamic-aggreg…
lucaro Nov 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 0 additions & 120 deletions config-ingestion.json

This file was deleted.

66 changes: 0 additions & 66 deletions config-schema.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ import org.vitrivr.engine.core.model.descriptor.Descriptor
import org.vitrivr.engine.core.model.metamodel.Analyser
import org.vitrivr.engine.core.model.metamodel.Schema
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.model.retrievable.attributes.CONTENT_AUTHORS_KEY
import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute
import org.vitrivr.engine.core.model.retrievable.attributes.DescriptorAuthorAttribute
import org.vitrivr.engine.core.operators.Operator
import org.vitrivr.engine.core.operators.ingest.Extractor

Expand All @@ -24,7 +24,37 @@ import org.vitrivr.engine.core.operators.ingest.Extractor
* @author Ralph Gasser
* @version 1.0.0
*/
abstract class AbstractBatchedExtractor<C : ContentElement<*>, D : Descriptor<*>>(final override val input: Operator<Retrievable>, final override val analyser: Analyser<C, D>, final override val field: Schema.Field<C, D>?, protected val parameters: Map<String, String>) : Extractor<C, D> {
abstract class AbstractBatchedExtractor<C : ContentElement<*>, D : Descriptor<*>>
private constructor(
final override val input: Operator<Retrievable>,
final override val analyser: Analyser<C, D>,
final override val field: Schema.Field<C, D>? = null,
protected val contentSources : Set<String>? = null,
final override val name: String,
private val bufferSize: Int
) :
Extractor<C, D> {

/**
 * Secondary constructor for an extractor that is bound to a schema [field].
 *
 * Delegates to the private primary constructor, forwarding [field] and using `field.fieldName` as the operator name.
 *
 * @param input The input [Operator] producing [Retrievable]s.
 * @param analyser The [Analyser] forwarded to the primary constructor.
 * @param contentSources Nullable set of content source names, forwarded unchanged. NOTE(review): semantics of `null` are not visible here - presumably "no restriction"; confirm.
 * @param field The [Schema.Field] this extractor is bound to; also supplies the name.
 * @param bufferSize Buffer size forwarded to the primary constructor; defaults to 100.
 */
constructor(input: Operator<Retrievable>, analyser: Analyser<C, D>, contentSources : Set<String>?, field: Schema.Field<C, D>, bufferSize: Int = 100) : this(
input,
analyser,
field,
contentSources,
field.fieldName,
bufferSize
)

/**
 * Secondary constructor for an extractor that is not bound to a schema field.
 *
 * Delegates to the private primary constructor with `null` as the field and the explicitly provided [name].
 *
 * @param input The input [Operator] producing [Retrievable]s.
 * @param analyser The [Analyser] forwarded to the primary constructor.
 * @param contentSources Nullable set of content source names, forwarded unchanged. NOTE(review): semantics of `null` are not visible here - presumably "no restriction"; confirm.
 * @param name The name assigned to this extractor.
 * @param bufferSize Buffer size forwarded to the primary constructor; defaults to 100.
 */
constructor(input: Operator<Retrievable>, analyser: Analyser<C, D>, contentSources : Set<String>?, name: String, bufferSize: Int = 100) : this(
input,
analyser,
null,
contentSources,
name,
bufferSize
)




companion object {
const val BATCH_SIZE_KEY = "batchSize"
Expand All @@ -37,13 +67,6 @@ abstract class AbstractBatchedExtractor<C : ContentElement<*>, D : Descriptor<*>
/** The [KLogger] instance used by this [AbstractBatchedExtractor]. */
protected val logger: KLogger = KotlinLogging.logger {}

/** The names of the content sources to consider during processing. */
protected val contentSources : Set<String>?
ppanopticon marked this conversation as resolved.
Show resolved Hide resolved
get() = this.parameters[CONTENT_AUTHORS_KEY]?.split(",")?.toSet()

/** The buffer- and batch size. */
private val bufferSize : Int
get() = this.parameters[BATCH_SIZE_KEY]?.toIntOrNull() ?: 1

/**
* A default [Extractor] implementation for batched extraction. It executes the following steps:
Expand Down Expand Up @@ -71,9 +94,12 @@ abstract class AbstractBatchedExtractor<C : ContentElement<*>, D : Descriptor<*>
logger.debug { "Batch size reached for field ${field?.fieldName}, extracting descriptors" }
val descriptors = extract(batch)
batch.forEachIndexed { i, r ->
val sourceAttribute = DescriptorAuthorAttribute()
descriptors[i].forEach { d ->
r.addDescriptor(d)
sourceAttribute.add(d, name)
}
r.addAttribute(sourceAttribute)
}
emitAll(batch.asFlow())
batch.clear()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ import org.vitrivr.engine.core.model.descriptor.Descriptor
import org.vitrivr.engine.core.model.metamodel.Analyser
import org.vitrivr.engine.core.model.metamodel.Schema
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.model.retrievable.attributes.CONTENT_AUTHORS_KEY
import org.vitrivr.engine.core.model.retrievable.attributes.ContentAuthorAttribute
import org.vitrivr.engine.core.model.retrievable.attributes.DescriptorAuthorAttribute
import org.vitrivr.engine.core.operators.Operator
import org.vitrivr.engine.core.operators.ingest.Extractor

Expand All @@ -21,7 +21,29 @@ import org.vitrivr.engine.core.operators.ingest.Extractor
* @author Ralph Gasser
* @version 1.3.0
*/
abstract class AbstractExtractor<C : ContentElement<*>, D : Descriptor<*>>(final override val input: Operator<Retrievable>, final override val analyser: Analyser<C, D>, final override val field: Schema.Field<C, D>? = null, protected val parameters: Map<String, String>) : Extractor<C, D> {
abstract class AbstractExtractor<C : ContentElement<*>, D : Descriptor<*>> private constructor(
final override val input: Operator<Retrievable>,
final override val analyser: Analyser<C, D>,
final override val field: Schema.Field<C, D>? = null,
protected val contentSources : Set<String>? = null,
final override val name: String
) : Extractor<C, D> {

/**
 * Secondary constructor for an extractor that is bound to a schema [field].
 *
 * Delegates to the private primary constructor, forwarding [field] and using `field.fieldName` as the operator name.
 * The class' `init` block additionally requires that the field's analyser equals [analyser].
 *
 * @param input The input [Operator] producing [Retrievable]s.
 * @param analyser The [Analyser] forwarded to the primary constructor.
 * @param contentSources Nullable set of content source names, forwarded unchanged. NOTE(review): semantics of `null` are not visible here - presumably "no restriction"; confirm.
 * @param field The [Schema.Field] this extractor is bound to; also supplies the name.
 */
constructor(input: Operator<Retrievable>, analyser: Analyser<C, D>, contentSources : Set<String>?, field: Schema.Field<C, D>) : this(
input,
analyser,
field,
contentSources,
field.fieldName
)

/**
 * Secondary constructor for an extractor that is not bound to a schema field.
 *
 * Delegates to the private primary constructor with `null` as the field and the explicitly provided [name].
 *
 * @param input The input [Operator] producing [Retrievable]s.
 * @param analyser The [Analyser] forwarded to the primary constructor.
 * @param contentSources Nullable set of content source names, forwarded unchanged. NOTE(review): semantics of `null` are not visible here - presumably "no restriction"; confirm.
 * @param name The name assigned to this extractor.
 */
constructor(input: Operator<Retrievable>, analyser: Analyser<C, D>, contentSources : Set<String>?, name: String) : this(
input,
analyser,
null,
contentSources,
name
)

init {
require(field == null || this.field.analyser == this.analyser) { "Field and analyser do not match! This is a programmer's error!" }
Expand All @@ -30,10 +52,6 @@ abstract class AbstractExtractor<C : ContentElement<*>, D : Descriptor<*>>(final
/** The [KLogger] instance used by this [AbstractExtractor]. */
protected val logger: KLogger = KotlinLogging.logger {}

/** The names of the content sources to consider during processing. */
protected val contentSources : Set<String>?
get() = this.parameters[CONTENT_AUTHORS_KEY]?.split(",")?.toSet()

/**
* A default [Extractor] implementation. It executes the following steps:
*
Expand All @@ -43,24 +61,31 @@ abstract class AbstractExtractor<C : ContentElement<*>, D : Descriptor<*>>(final
*
* @return [Flow] of [Retrievable]
*/
override fun toFlow(scope: CoroutineScope): Flow<Retrievable> = this.input.toFlow(scope).onEach { retrievable ->
if (this.matches(retrievable)) {
/* Perform extraction. */
val descriptors = try {
logger.debug{"Extraction on field ${field?.fieldName} for retrievable: $retrievable" }
extract(retrievable)
} catch (e: Throwable) {
logger.error(e) { "Error during extraction of $retrievable" }
emptyList()
}
override fun toFlow(scope: CoroutineScope): Flow<Retrievable> =
this.input.toFlow(scope).onEach { retrievable ->
if (this.matches(retrievable)) {
/* Perform extraction. */
val descriptors = try {
logger.debug{"Extraction on field ${field?.fieldName} for retrievable: $retrievable" }
extract(retrievable)
} catch (e: Throwable) {
logger.error(e) { "Error during extraction of $retrievable" }
emptyList()
}

if (descriptors.isNotEmpty()) {
/* Append descriptor. */
logger.trace { "Extracted descriptors for retrievable ($retrievable): $descriptors" }
val authorAttribute = DescriptorAuthorAttribute()
for (d in descriptors) {
retrievable.addDescriptor(d)
authorAttribute.add(d, this.name)
}
retrievable.addAttribute(authorAttribute)
}

/* Append descriptor. */
logger.trace { "Extracted descriptors for retrievable ($retrievable): $descriptors" }
for (d in descriptors) {
retrievable.addDescriptor(d)
}
}
}

/**
* Internal method to check, if [Retrievable] matches this [Extractor] and should thus be processed.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import org.vitrivr.engine.core.model.metamodel.Schema
import org.vitrivr.engine.core.model.query.Query
import org.vitrivr.engine.core.model.query.proximity.ProximityQuery
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.model.retrievable.attributes.CONTENT_AUTHORS_KEY
import org.vitrivr.engine.core.model.types.Value
import org.vitrivr.engine.core.operators.Operator
import org.vitrivr.engine.core.operators.ingest.Extractor
Expand Down Expand Up @@ -52,7 +53,7 @@ class AverageColor : Analyser<ImageContent, FloatVectorDescriptor> {
* @return A new [Extractor] instance for this [Analyser]
* @throws [UnsupportedOperationException], if this [Analyser] does not support the creation of an [Extractor] instance.
*/
override fun newExtractor(field: Schema.Field<ImageContent, FloatVectorDescriptor>, input: Operator<Retrievable>, context: IndexContext) = AverageColorExtractor(input, this, field, merge(field, context))
// Content sources are looked up in the context under (field name, CONTENT_AUTHORS_KEY) and split on ",";
// the result is null when the lookup yields null. NOTE(review): entries are not trimmed, so "a, b" yields " b".
override fun newExtractor(field: Schema.Field<ImageContent, FloatVectorDescriptor>, input: Operator<Retrievable>, context: IndexContext) = AverageColorExtractor(input, this, context[field.fieldName, CONTENT_AUTHORS_KEY]?.split(",")?.toSet(), field)

/**
* Generates and returns a new [AverageColorExtractor] instance for this [AverageColor].
Expand All @@ -64,7 +65,7 @@ class AverageColor : Analyser<ImageContent, FloatVectorDescriptor> {
* @return A new [Extractor] instance for this [Analyser]
* @throws [UnsupportedOperationException], if this [Analyser] does not support the creation of an [Extractor] instance.
*/
override fun newExtractor(name: String, input: Operator<Retrievable>, context: IndexContext): Extractor<ImageContent, FloatVectorDescriptor> = AverageColorExtractor(input, this, null, context.local[name] ?: emptyMap())
// Content sources are looked up in the context under (name, CONTENT_AUTHORS_KEY) and split on ",";
// the result is null when the lookup yields null. NOTE(review): entries are not trimmed, so "a, b" yields " b".
override fun newExtractor(name: String, input: Operator<Retrievable>, context: IndexContext): Extractor<ImageContent, FloatVectorDescriptor> = AverageColorExtractor(input, this, context[name, CONTENT_AUTHORS_KEY]?.split(",")?.toSet(), name)

/**
* Generates and returns a new [DenseRetriever] instance for this [AverageColor].
Expand Down
Loading