Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Research: ColumnDataHolder/primitive arrays #712

Draft
wants to merge 19 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions core/build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,19 @@ plugins {

// only mandatory if `kotlin.dataframe.add.ksp=false` in gradle.properties
alias(ksp)

id("org.jetbrains.kotlinx.benchmark") version "0.4.11"
}
idea
}

// Configuration of the `org.jetbrains.kotlinx.benchmark` plugin applied in the `plugins` block.
// Registers a benchmark target named "test" (presumably the `test` source set — confirm) with default settings.
benchmark {
targets {
register("test") {
}
}
}

group = "org.jetbrains.kotlinx"

val jupyterApiTCRepo: String by project
Expand Down Expand Up @@ -82,6 +91,12 @@ dependencies {
}
testImplementation(libs.kotlin.scriptingJvm)
testImplementation(libs.jsoup)

// testImplementation("org.openjdk.jol:jol-core:0.10")
implementation("org.openjdk.jol:jol-core:0.10")
implementation("it.unimi.dsi:fastutil:8.5.14")
implementation("io.deephaven:deephaven-csv:0.14.0")
testImplementation("org.jetbrains.kotlinx:kotlinx-benchmark-runtime:0.4.11")
}

val samplesImplementation by configurations.getting {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
@file:OptIn(ExperimentalUnsignedTypes::class)

package org.jetbrains.kotlinx.dataframe

import org.jetbrains.kotlinx.dataframe.impl.columns.BOOLEAN
import org.jetbrains.kotlinx.dataframe.impl.columns.BYTE
import org.jetbrains.kotlinx.dataframe.impl.columns.CHAR
import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnDataHolderImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.DOUBLE
import org.jetbrains.kotlinx.dataframe.impl.columns.FLOAT
import org.jetbrains.kotlinx.dataframe.impl.columns.INT
import org.jetbrains.kotlinx.dataframe.impl.columns.LONG
import org.jetbrains.kotlinx.dataframe.impl.columns.SHORT
import org.jetbrains.kotlinx.dataframe.impl.columns.UBYTE
import org.jetbrains.kotlinx.dataframe.impl.columns.UINT
import org.jetbrains.kotlinx.dataframe.impl.columns.ULONG
import org.jetbrains.kotlinx.dataframe.impl.columns.USHORT
import org.jetbrains.kotlinx.dataframe.impl.columns.ofBoxedArray
import org.jetbrains.kotlinx.dataframe.impl.columns.ofCollection
import org.jetbrains.kotlinx.dataframe.impl.columns.ofPrimitiveArray
import kotlin.reflect.KType
import kotlin.reflect.typeOf

/**
* Represents the contents of a column, regardless of how that storage is implemented.
* The default implementation is found at [ColumnDataHolderImpl].
*
* A holder is itself a [List] of its elements and additionally exposes the operations declared below.
*/
public interface ColumnDataHolder<T> : List<T> {

/** Returns the elements of this holder as a [Set]. */
public fun toSet(): Set<T>

/** Returns the elements selected by [range] as a [List]. */
public operator fun get(range: IntRange): List<T>

/**
* Adds [element] to this holder.
*
* NOTE(review): [List] is a read-only interface, so implementations are presumably mutable — confirm.
*/
public fun add(element: T)

/** Returns whether [element] can be added to this holder; the criteria are implementation-defined. */
public fun canAdd(element: T): Boolean

/** Lazily evaluated [Set] of values; presumably the distinct elements of this holder — confirm in [ColumnDataHolderImpl]. */
public val distinct: Lazy<Set<T>>

/** Empty companion that serves as the receiver of factory extensions such as `ofCollection`, `ofBoxedArray` and `ofPrimitiveArray`. */
public companion object
}

/**
 * Builds a [ColumnDataHolder] from this collection by delegating to `ColumnDataHolder.ofCollection`.
 *
 * @param type the [KType] passed through to the factory.
 * @param distinct optional lazy set passed through to the factory; `null` when omitted.
 */
public fun <T> Collection<T>.toColumnDataHolder(
    type: KType,
    distinct: Lazy<Set<T>>? = null,
): ColumnDataHolder<T> {
    return ColumnDataHolder.ofCollection(this, type, distinct)
}

/**
 * Reified variant of [toColumnDataHolder] for collections: the type argument is obtained with `typeOf<T>()`.
 *
 * @param distinct optional lazy set passed through to the non-reified overload; `null` when omitted.
 */
public inline fun <reified T> Collection<T>.toColumnDataHolder(
    distinct: Lazy<Set<T>>? = null,
): ColumnDataHolder<T> {
    return toColumnDataHolder(typeOf<T>(), distinct)
}

/**
 * Builds a [ColumnDataHolder] from this array by delegating to `ColumnDataHolder.ofBoxedArray`.
 *
 * @param type the [KType] passed through to the factory.
 * @param distinct optional lazy set passed through to the factory; `null` when omitted.
 */
public fun <T> Array<T>.toColumnDataHolder(
    type: KType,
    distinct: Lazy<Set<T>>? = null,
): ColumnDataHolder<T> {
    return ColumnDataHolder.ofBoxedArray(this, type, distinct)
}

/**
 * Reified variant of [toColumnDataHolder] for arrays: the type argument is obtained with `typeOf<T>()`.
 *
 * @param distinct optional lazy set passed through to the non-reified overload; `null` when omitted.
 */
public inline fun <reified T> Array<T>.toColumnDataHolder(
    distinct: Lazy<Set<T>>? = null,
): ColumnDataHolder<T> {
    return toColumnDataHolder(typeOf<T>(), distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Boolean] from this [BooleanArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [BOOLEAN] and the optional [distinct] set.
 */
public fun BooleanArray.asColumnDataHolder(
    distinct: Lazy<Set<Boolean>>? = null,
): ColumnDataHolder<Boolean> {
    return ColumnDataHolder.ofPrimitiveArray(this, BOOLEAN, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Byte] from this [ByteArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [BYTE] and the optional [distinct] set.
 */
public fun ByteArray.asColumnDataHolder(
    distinct: Lazy<Set<Byte>>? = null,
): ColumnDataHolder<Byte> {
    return ColumnDataHolder.ofPrimitiveArray(this, BYTE, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Short] from this [ShortArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [SHORT] and the optional [distinct] set.
 */
public fun ShortArray.asColumnDataHolder(
    distinct: Lazy<Set<Short>>? = null,
): ColumnDataHolder<Short> {
    return ColumnDataHolder.ofPrimitiveArray(this, SHORT, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Int] from this [IntArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [INT] and the optional [distinct] set.
 */
public fun IntArray.asColumnDataHolder(
    distinct: Lazy<Set<Int>>? = null,
): ColumnDataHolder<Int> {
    return ColumnDataHolder.ofPrimitiveArray(this, INT, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Long] from this [LongArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [LONG] and the optional [distinct] set.
 */
public fun LongArray.asColumnDataHolder(
    distinct: Lazy<Set<Long>>? = null,
): ColumnDataHolder<Long> {
    return ColumnDataHolder.ofPrimitiveArray(this, LONG, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Float] from this [FloatArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [FLOAT] and the optional [distinct] set.
 */
public fun FloatArray.asColumnDataHolder(
    distinct: Lazy<Set<Float>>? = null,
): ColumnDataHolder<Float> {
    return ColumnDataHolder.ofPrimitiveArray(this, FLOAT, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Double] from this [DoubleArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [DOUBLE] and the optional [distinct] set.
 */
public fun DoubleArray.asColumnDataHolder(
    distinct: Lazy<Set<Double>>? = null,
): ColumnDataHolder<Double> {
    return ColumnDataHolder.ofPrimitiveArray(this, DOUBLE, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [Char] from this [CharArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [CHAR] and the optional [distinct] set.
 */
public fun CharArray.asColumnDataHolder(
    distinct: Lazy<Set<Char>>? = null,
): ColumnDataHolder<Char> {
    return ColumnDataHolder.ofPrimitiveArray(this, CHAR, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [UByte] from this [UByteArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [UBYTE] and the optional [distinct] set.
 */
public fun UByteArray.asColumnDataHolder(
    distinct: Lazy<Set<UByte>>? = null,
): ColumnDataHolder<UByte> {
    return ColumnDataHolder.ofPrimitiveArray(this, UBYTE, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [UShort] from this [UShortArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [USHORT] and the optional [distinct] set.
 */
public fun UShortArray.asColumnDataHolder(
    distinct: Lazy<Set<UShort>>? = null,
): ColumnDataHolder<UShort> {
    return ColumnDataHolder.ofPrimitiveArray(this, USHORT, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [UInt] from this [UIntArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [UINT] and the optional [distinct] set.
 */
public fun UIntArray.asColumnDataHolder(
    distinct: Lazy<Set<UInt>>? = null,
): ColumnDataHolder<UInt> {
    return ColumnDataHolder.ofPrimitiveArray(this, UINT, distinct)
}

/**
 * Creates a [ColumnDataHolder] of [ULong] from this [ULongArray] via `ColumnDataHolder.ofPrimitiveArray`,
 * passing [ULONG] and the optional [distinct] set.
 */
public fun ULongArray.asColumnDataHolder(
    distinct: Lazy<Set<ULong>>? = null,
): ColumnDataHolder<ULong> {
    return ColumnDataHolder.ofPrimitiveArray(this, ULONG, distinct)
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import org.jetbrains.kotlinx.dataframe.impl.columns.FrameColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.ValueColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.columns.addPath
import org.jetbrains.kotlinx.dataframe.impl.columns.guessColumnType
import org.jetbrains.kotlinx.dataframe.impl.columns.ofCollection
import org.jetbrains.kotlinx.dataframe.impl.columns.ofBoxedArray
import org.jetbrains.kotlinx.dataframe.impl.columns.toColumnKind
import org.jetbrains.kotlinx.dataframe.impl.getValuesType
import org.jetbrains.kotlinx.dataframe.impl.splitByIndices
Expand All @@ -42,6 +44,49 @@ public interface DataColumn<out T> : BaseColumn<T> {

public companion object {

/**
 * Creates a [ValueColumn] directly on top of an existing [ColumnDataHolder].
 *
 * The arguments are handed to [ValueColumnImpl] unchanged; no type inference is performed here.
 *
 * @param name name of the resulting column.
 * @param values holder providing the column contents.
 * @param type the [KType] given to the column.
 * @param defaultValue optional default value passed to [ValueColumnImpl]; `null` when omitted.
 */
public fun <T> createValueColumn(
    name: String,
    values: ColumnDataHolder<T>,
    type: KType,
    defaultValue: T? = null,
): ValueColumn<T> {
    return ValueColumnImpl(values, name, type, defaultValue)
}

/** Creates a [ValueColumn] of [Boolean] named [name] from a [BooleanArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: BooleanArray): ValueColumn<Boolean> {
    val holder: ColumnDataHolder<Boolean> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Boolean>())
}

/** Creates a [ValueColumn] of [Byte] named [name] from a [ByteArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: ByteArray): ValueColumn<Byte> {
    val holder: ColumnDataHolder<Byte> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Byte>())
}

/** Creates a [ValueColumn] of [Short] named [name] from a [ShortArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: ShortArray): ValueColumn<Short> {
    val holder: ColumnDataHolder<Short> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Short>())
}

/** Creates a [ValueColumn] of [Int] named [name] from an [IntArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: IntArray): ValueColumn<Int> {
    val holder: ColumnDataHolder<Int> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Int>())
}

/** Creates a [ValueColumn] of [Long] named [name] from a [LongArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: LongArray): ValueColumn<Long> {
    val holder: ColumnDataHolder<Long> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Long>())
}

/** Creates a [ValueColumn] of [Float] named [name] from a [FloatArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: FloatArray): ValueColumn<Float> {
    val holder: ColumnDataHolder<Float> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Float>())
}

/** Creates a [ValueColumn] of [Double] named [name] from a [DoubleArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: DoubleArray): ValueColumn<Double> {
    val holder: ColumnDataHolder<Double> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Double>())
}

/** Creates a [ValueColumn] of [Char] named [name] from a [CharArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: CharArray): ValueColumn<Char> {
    val holder: ColumnDataHolder<Char> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<Char>())
}

/** Creates a [ValueColumn] of [UByte] named [name] from a [UByteArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: UByteArray): ValueColumn<UByte> {
    val holder: ColumnDataHolder<UByte> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<UByte>())
}

/** Creates a [ValueColumn] of [UShort] named [name] from a [UShortArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: UShortArray): ValueColumn<UShort> {
    val holder: ColumnDataHolder<UShort> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<UShort>())
}

/** Creates a [ValueColumn] of [UInt] named [name] from a [UIntArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: UIntArray): ValueColumn<UInt> {
    val holder: ColumnDataHolder<UInt> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<UInt>())
}

/** Creates a [ValueColumn] of [ULong] named [name] from a [ULongArray], going through `asColumnDataHolder()`. */
public fun createValueColumn(name: String, values: ULongArray): ValueColumn<ULong> {
    val holder: ColumnDataHolder<ULong> = values.asColumnDataHolder()
    return createValueColumn(name = name, values = holder, type = typeOf<ULong>())
}

/**
* Creates [ValueColumn] using given [name], [values] and [type].
*
Expand All @@ -56,7 +101,15 @@ public interface DataColumn<out T> : BaseColumn<T> {
type: KType,
infer: Infer = Infer.None,
defaultValue: T? = null,
): ValueColumn<T> = ValueColumnImpl(values, name, getValuesType(values, type, infer), defaultValue)
): ValueColumn<T> {
val valueType = getValuesType(values, type, infer)
return createValueColumn(
name = name,
values = ColumnDataHolder.ofCollection(values, valueType),
type = valueType,
defaultValue = defaultValue,
)
}

/**
* Creates [ValueColumn] using given [name], [values] and reified column [type].
Expand All @@ -74,25 +127,56 @@ public interface DataColumn<out T> : BaseColumn<T> {
infer: Infer = Infer.None,
): ValueColumn<T> =
createValueColumn(
name,
values,
getValuesType(
values,
typeOf<T>(),
infer,
name = name,
values = values,
type = getValuesType(
values = values,
type = typeOf<T>(),
infer = infer,
),
)

/**
 * Creates a [ValueColumn] from a boxed [Array].
 *
 * The column type is first resolved with `getValuesType` from the array contents (viewed as a list),
 * the given [type] and [infer]; the array is then turned into a holder via `ColumnDataHolder.ofBoxedArray`
 * using that resolved type.
 *
 * @param name name of the resulting column.
 * @param values array providing the column contents.
 * @param type the declared [KType], used as input to `getValuesType`.
 * @param infer inference mode passed to `getValuesType`; defaults to [Infer.None].
 * @param defaultValue optional default value passed on to the created column; `null` when omitted.
 */
public fun <T> createValueColumn(
    name: String,
    values: Array<T>,
    type: KType,
    infer: Infer = Infer.None,
    defaultValue: T? = null,
): ValueColumn<T> {
    val resolvedType = getValuesType(values.asList(), type, infer)
    val holder: ColumnDataHolder<T> = ColumnDataHolder.ofBoxedArray(values, resolvedType)
    return createValueColumn(name = name, values = holder, type = resolvedType, defaultValue = defaultValue)
}

/**
 * Reified variant of [createValueColumn] for boxed arrays.
 *
 * Resolves the column type with `getValuesType` from the array contents, `typeOf<T>()` and [infer],
 * then delegates to the overload that takes an explicit type.
 *
 * @param name name of the resulting column.
 * @param values array providing the column contents.
 * @param infer inference mode passed to `getValuesType`; defaults to [Infer.None].
 */
public inline fun <reified T> createValueColumn(
    name: String,
    values: Array<T>,
    infer: Infer = Infer.None,
): ValueColumn<T> {
    val resolvedType = getValuesType(values.asList(), typeOf<T>(), infer)
    return createValueColumn(name = name, values = values, type = resolvedType)
}

/** Creates a [ColumnGroup] named [name] backed by [df], by constructing a [ColumnGroupImpl]. */
public fun <T> createColumnGroup(name: String, df: DataFrame<T>): ColumnGroup<T> {
    return ColumnGroupImpl(name, df)
}

public fun <T> createFrameColumn(name: String, df: DataFrame<T>, startIndices: Iterable<Int>): FrameColumn<T> =
FrameColumnImpl(name, df.splitByIndices(startIndices.asSequence()).toList(), lazy { df.schema() })
FrameColumnImpl(
name,
df.splitByIndices(startIndices.asSequence()).toList().toColumnDataHolder(),
lazy { df.schema() },
)

public fun <T> createFrameColumn(
name: String,
groups: List<DataFrame<T>>,
schema: Lazy<DataFrameSchema>? = null,
): FrameColumn<T> = FrameColumnImpl(name, groups, schema)
): FrameColumn<T> = FrameColumnImpl(name, groups.toColumnDataHolder(), schema)

public fun <T> createWithTypeInference(
name: String,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ public interface ColumnSelectionDsl<out T> : ColumnsContainer<T> {
/**
* Retrieves the value of this [ColumnPath] from the [DataFrame].
* This is a shorthand for [getColumn][ColumnsContainer.getColumn]`(myColumnPath)` and
* is most often used in combination with `operator fun String.get(column: String)`,
* is most often used in combination with `operator fun String.get(column: String)`,
* for instance:
* ```kotlin
* "myColumn"["myNestedColumn"]<NestedColumnType>()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@ import org.jetbrains.kotlinx.dataframe.columns.ColumnPath
import org.jetbrains.kotlinx.dataframe.columns.ColumnSet
import org.jetbrains.kotlinx.dataframe.columns.ColumnsResolver
import org.jetbrains.kotlinx.dataframe.columns.SingleColumn
import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl
import org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate
import org.jetbrains.kotlinx.dataframe.documentation.ExcludeFromSources
import org.jetbrains.kotlinx.dataframe.documentation.ExportAsHtml
import org.jetbrains.kotlinx.dataframe.documentation.Indent
import org.jetbrains.kotlinx.dataframe.documentation.LineBreak
import org.jetbrains.kotlinx.dataframe.documentation.SelectingColumns
import org.jetbrains.kotlinx.dataframe.impl.DataFrameReceiver
import org.jetbrains.kotlinx.dataframe.impl.columns.ColumnsList
import org.jetbrains.kotlinx.dataframe.util.COL_SELECT_DSL_LIST_DATACOLUMN_GET
import org.jetbrains.kotlinx.dataframe.util.COL_SELECT_DSL_LIST_DATACOLUMN_GET_REPLACE
Expand Down Expand Up @@ -187,7 +195,7 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
* ### What can be called directly in the [Columns Selection DSL][org.jetbrains.kotlinx.dataframe.api.ColumnsSelectionDsl]:
*
*
*
* &nbsp;&nbsp;&nbsp;&nbsp;
*
* [`column`][org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate.ColumnDef]` `[**`..`**][org.jetbrains.kotlinx.dataframe.api.ColumnRangeColumnsSelectionDsl.rangeTo]` `[`column`][org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate.ColumnDef]
Expand Down Expand Up @@ -250,7 +258,7 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
* ### What can be called on a [ColumnSet][org.jetbrains.kotlinx.dataframe.columns.ColumnSet]:
*
*
*
* &nbsp;&nbsp;&nbsp;&nbsp;
*
* [`columnSet`][org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate.ColumnSetDef]
Expand Down Expand Up @@ -313,7 +321,7 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
* ### What can be called on a [Column Group (reference)][org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate.ColumnGroupDef]:
*
*
*
* &nbsp;&nbsp;&nbsp;&nbsp;
*
* [`columnGroup`][org.jetbrains.kotlinx.dataframe.documentation.DslGrammarTemplateColumnsSelectionDsl.DslGrammarTemplate.ColumnGroupDef]
Expand Down Expand Up @@ -391,7 +399,16 @@ public interface ColumnsSelectionDsl<out T> : // SingleColumn<DataRow<T>>
*
*
*/
public interface DslGrammar
public interface DslGrammar {








}

/**
* Invokes the given [ColumnsSelector] using this [ColumnsSelectionDsl].
Expand Down
Loading