diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/CanParseUtils.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/CanParseUtils.kt new file mode 100644 index 000000000..909f5dec5 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/CanParseUtils.kt @@ -0,0 +1,152 @@ +package org.jetbrains.kotlinx.dataframe.impl + +import kotlin.time.Duration +import kotlin.time.DurationUnit + +/** + * Checks if the string can be parsed as a duration without throwing an exception. + * + * The logic is taken from [Duration.parse] (Kotlin version 2.0.20), + * so it should return the same result. + * + * @param value the string to check + */ +internal fun Duration.Companion.canParse(value: String): Boolean { + var length = value.length + if (length == 0) return false + var index = 0 + val infinityString = "Infinity" + when (value[index]) { + '+', '-' -> index++ + } + val hasSign = index > 0 + when { + length <= index -> return false + + value[index] == 'P' -> { + if (++index == length) return false + val nonDigitSymbols = "+-." + var isTimeComponent = false + var prevUnit: DurationUnit? = null + while (index < length) { + if (value[index] == 'T') { + if (isTimeComponent || ++index == length) return false + isTimeComponent = true + continue + } + val component = value.substringWhile(index) { it in '0'..'9' || it in nonDigitSymbols } + if (component.isEmpty()) return false + index += component.length + val unitChar = value.getOrElse(index) { return false } + index++ + val unit = durationUnitByIsoCharOrNull(unitChar, isTimeComponent) ?: return false + if (prevUnit != null && prevUnit <= unit) return false + prevUnit = unit + } + } + + value.regionMatches( + thisOffset = index, + other = infinityString, + otherOffset = 0, + length = maxOf(length - index, infinityString.length), + ignoreCase = true, + ) -> return true + + else -> { + // parse default string format + var prevUnit: DurationUnit? 
= null + var afterFirst = false + var allowSpaces = !hasSign + if (hasSign && value[index] == '(' && value.last() == ')') { + allowSpaces = true + if (++index == --length) return false + } + while (index < length) { + if (afterFirst && allowSpaces) { + index = value.skipWhile(index) { it == ' ' } + } + afterFirst = true + val component = value.substringWhile(index) { it in '0'..'9' || it == '.' } + if (component.isEmpty()) return false + index += component.length + val unitName = value.substringWhile(index) { it in 'a'..'z' } + index += unitName.length + val unit = durationUnitByShortNameOrNull(unitName) ?: return false + if (prevUnit != null && prevUnit <= unit) return false + prevUnit = unit + val dotIndex = component.indexOf('.') + if (dotIndex > 0) { + if (index < length) return false + } + } + } + } + return true +} + +/** + * Checks if the string can be parsed as a java duration without throwing an exception. + */ +internal fun javaDurationCanParse(value: String): Boolean = isoDurationRegex.matches(value) + +/** + * Regex from [java.time.Duration.Lazy.PATTERN] (groups made non-capturing): the ISO-8601 duration format. + */ +private val isoDurationRegex = Regex( + pattern = "[-+]?P(?:[-+]?[0-9]+D)?(?:T(?:[-+]?[0-9]+H)?(?:[-+]?[0-9]+M)?(?:[-+]?[0-9]+(?:[.,][0-9]{0,9})?S)?)?", + option = RegexOption.IGNORE_CASE, +) + +/** + * Copy of [kotlin.time.substringWhile] (Kotlin version 2.0.20). + */ +private inline fun String.substringWhile(startIndex: Int, predicate: (Char) -> Boolean): String = + substring(startIndex, skipWhile(startIndex, predicate)) + +/** + * Copy of [kotlin.time.skipWhile] (Kotlin version 2.0.20). + */ +private inline fun String.skipWhile(startIndex: Int, predicate: (Char) -> Boolean): Int { + var i = startIndex + while (i < length && predicate(this[i])) i++ + return i +} + +/** + * Copy of [kotlin.time.durationUnitByIsoChar] (Kotlin version 2.0.20). + */ +private fun durationUnitByIsoCharOrNull(isoChar: Char, isTimeComponent: Boolean): DurationUnit?
= + when { + !isTimeComponent -> { + when (isoChar) { + 'D' -> DurationUnit.DAYS + + else -> null + } + } + + else -> { + when (isoChar) { + 'H' -> DurationUnit.HOURS + 'M' -> DurationUnit.MINUTES + 'S' -> DurationUnit.SECONDS + else -> null + } + } + } + +/** + * Copy of [kotlin.time.durationUnitByShortName] (Kotlin version 2.0.20). + */ +private fun durationUnitByShortNameOrNull(shortName: String): DurationUnit? = + when (shortName) { + "ns" -> DurationUnit.NANOSECONDS + "us" -> DurationUnit.MICROSECONDS + "ms" -> DurationUnit.MILLISECONDS + "s" -> DurationUnit.SECONDS + "m" -> DurationUnit.MINUTES + "h" -> DurationUnit.HOURS + "d" -> DurationUnit.DAYS + else -> null + } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index e07e92638..21b227377 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -4,10 +4,12 @@ import kotlinx.datetime.Instant import kotlinx.datetime.LocalDate import kotlinx.datetime.LocalDateTime import kotlinx.datetime.LocalTime +import kotlinx.datetime.format.DateTimeComponents import kotlinx.datetime.toKotlinLocalDate import kotlinx.datetime.toKotlinLocalDateTime import kotlinx.datetime.toKotlinLocalTime import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.ColumnsSelector import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame @@ -27,11 +29,14 @@ import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.columns.values import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.jetbrains.kotlinx.dataframe.hasNulls +import org.jetbrains.kotlinx.dataframe.impl.canParse import org.jetbrains.kotlinx.dataframe.impl.catchSilent import 
org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType +import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse import org.jetbrains.kotlinx.dataframe.io.isURL import org.jetbrains.kotlinx.dataframe.io.readJsonStr import org.jetbrains.kotlinx.dataframe.typeClass +import java.math.BigDecimal import java.net.URL import java.text.NumberFormat import java.text.ParsePosition @@ -55,10 +60,17 @@ internal interface StringParser { fun applyOptions(options: ParserOptions?): (String) -> T? + /** If a parser with one of these types is run, this parser can be skipped. */ + val coveredBy: Collection + val type: KType } -internal open class DelegatedStringParser(override val type: KType, val handle: (String) -> T?) : StringParser { +internal open class DelegatedStringParser( + override val type: KType, + override val coveredBy: Collection, + val handle: (String) -> T?, +) : StringParser { override fun toConverter(options: ParserOptions?): TypeConverter { val nulls = options?.nullStrings ?: Parsers.nulls return { @@ -76,6 +88,7 @@ internal open class DelegatedStringParser(override val type: KType, val handl internal class StringParserWithFormat( override val type: KType, + override val coveredBy: Collection, val getParser: (ParserOptions?) -> ((String) -> T?), ) : StringParser { override fun toConverter(options: ParserOptions?): TypeConverter { @@ -150,6 +163,30 @@ internal object Parsers : GlobalParserOptions { return null } + private fun String.toInstantOrNull(): Instant? { + // Default format used by Instant.parse + val format = DateTimeComponents.Formats.ISO_DATE_TIME_OFFSET + return catchSilent { + // low chance throwing exception, thanks to using parseOrNull instead of parse + format.parseOrNull(this)?.toInstantUsingOffset() + } + } + + private fun String.toJavaInstantOrNull(): JavaInstant? 
{ + // Default format used by java.time.Instant.parse + val format = DateTimeFormatter.ISO_INSTANT + return catchSilent { + // low chance throwing exception (parseUnresolved instead of parse); whole text must be consumed + val parsePosition = ParsePosition(0) + val accessor = format.parseUnresolved(this, parsePosition) + if (accessor != null && parsePosition.errorIndex == -1 && parsePosition.index == this.length) { + JavaInstant.from(accessor) + } else { + null + } + } + } + private fun String.toLocalDateTimeOrNull(formatter: DateTimeFormatter?): LocalDateTime? = toJavaLocalDateTimeOrNull(formatter)?.toKotlinLocalDateTime() @@ -196,6 +233,20 @@ internal object Parsers : GlobalParserOptions { private fun String.toLocalTimeOrNull(formatter: DateTimeFormatter?): LocalTime? = toJavaLocalTimeOrNull(formatter)?.toKotlinLocalTime() + private fun String.toJavaDurationOrNull(): JavaDuration? = + if (javaDurationCanParse(this)) { + catchSilent { JavaDuration.parse(this) } // will likely succeed + } else { + null + } + + private fun String.toDurationOrNull(): Duration? = + if (Duration.canParse(this)) { + catchSilent { Duration.parse(this) } // will likely succeed + } else { + null + } + private fun String.parseDouble(format: NumberFormat) = when (uppercase(Locale.getDefault())) { "NAN" -> Double.NaN @@ -219,21 +270,23 @@ internal object Parsers : GlobalParserOptions { } } - inline fun stringParser(catch: Boolean = false, noinline body: (String) -> T?): StringParser = + inline fun stringParser( + catch: Boolean = false, + coveredBy: Set = emptySet(), + noinline body: (String) -> T?, + ): StringParser = if (catch) { - DelegatedStringParser(typeOf()) { - try { - body(it) - } catch (e: Throwable) { - null - } + DelegatedStringParser(typeOf(), coveredBy) { + catchSilent { body(it) } } } else { - DelegatedStringParser(typeOf(), body) + DelegatedStringParser(typeOf(), coveredBy, body) } - inline fun stringParserWithOptions(noinline body: (ParserOptions?)
-> ((String) -> T?)) = - StringParserWithFormat(typeOf(), body) + inline fun stringParserWithOptions( + coveredBy: Set = emptySet(), + noinline body: (ParserOptions?) -> ((String) -> T?), + ): StringParserWithFormat = StringParserWithFormat(typeOf(), coveredBy, body) private val parserToDoubleWithOptions = stringParserWithOptions { options -> val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) @@ -243,69 +296,107 @@ internal object Parsers : GlobalParserOptions { private val parsersOrder = listOf( // Int - stringParser { it.toIntOrNull() }, + stringParser { it.toIntOrNull() }, // Long - stringParser { it.toLongOrNull() }, + stringParser { it.toLongOrNull() }, // kotlinx.datetime.Instant - stringParser { catchSilent { Instant.parse(it) } }, - // java.time.Instant - stringParser { catchSilent { JavaInstant.parse(it) } }, + stringParser { + it.toInstantOrNull() + }, +// stringParser(true) { +// Instant.parse(it) +// }, // TODO remove + // java.time.Instant, will be skipped if kotlinx.datetime.Instant is already checked + stringParser(coveredBy = setOf(typeOf())) { + it.toJavaInstantOrNull() + }, +// stringParser(catch = true /*coveredBy = setOf(typeOf())*/) { +// JavaInstant.parse(it) +// }, // TODO remove // kotlinx.datetime.LocalDateTime - stringParserWithOptions { options -> + stringParserWithOptions { options -> val formatter = options?.getDateTimeFormatter() val parser = { it: String -> it.toLocalDateTimeOrNull(formatter) } parser }, - // java.time.LocalDateTime - stringParserWithOptions { options -> + // java.time.LocalDateTime, will be skipped if kotlinx.datetime.LocalDateTime is already checked + stringParserWithOptions(coveredBy = setOf(typeOf())) { options -> val formatter = options?.getDateTimeFormatter() val parser = { it: String -> it.toJavaLocalDateTimeOrNull(formatter) } parser }, // kotlinx.datetime.LocalDate - stringParserWithOptions { options -> + stringParserWithOptions { options -> val formatter = 
options?.getDateTimeFormatter() val parser = { it: String -> it.toLocalDateOrNull(formatter) } parser }, - // java.time.LocalDate - stringParserWithOptions { options -> + // java.time.LocalDate, will be skipped if kotlinx.datetime.LocalDate is already checked + stringParserWithOptions(coveredBy = setOf(typeOf())) { options -> val formatter = options?.getDateTimeFormatter() val parser = { it: String -> it.toJavaLocalDateOrNull(formatter) } parser }, // kotlin.time.Duration - stringParser { catchSilent { Duration.parse(it) } }, - // java.time.Duration - stringParser { catchSilent { JavaDuration.parse(it) } }, + stringParser { + it.toDurationOrNull() + }, +// stringParser(true) { +// Duration.parse(it) +// }, // TODO remove + // java.time.Duration, will be skipped if kotlin.time.Duration is already checked + stringParser(coveredBy = setOf(typeOf())) { + it.toJavaDurationOrNull() + }, +// stringParser(true/*coveredBy = setOf(typeOf())*/) { +// JavaDuration.parse(it) +// }, // TODO remove // kotlinx.datetime.LocalTime - stringParserWithOptions { options -> + stringParserWithOptions { options -> val formatter = options?.getDateTimeFormatter() val parser = { it: String -> it.toLocalTimeOrNull(formatter) } parser }, - // java.time.LocalTime - stringParserWithOptions { options -> + // java.time.LocalTime, will be skipped if kotlinx.datetime.LocalTime is already checked + stringParserWithOptions(coveredBy = setOf(typeOf())) { options -> val formatter = options?.getDateTimeFormatter() val parser = { it: String -> it.toJavaLocalTimeOrNull(formatter) } parser }, // java.net.URL - stringParser { it.toUrlOrNull() }, + stringParser { it.toUrlOrNull() }, // Double, with explicit number format or taken from current locale parserToDoubleWithOptions, // Double, with POSIX format - stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) }, + stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) }, // Boolean - 
stringParser { it.toBooleanOrNull() }, + stringParser { it.toBooleanOrNull() }, // BigDecimal - stringParser { it.toBigDecimalOrNull() }, - stringParser(catch = true) { if (it.startsWith("[")) DataFrame.readJsonStr(it) else null }, - stringParser(catch = true) { if (it.startsWith("{")) DataFrame.readJsonStr(it).single() else null }, - stringParser { it }, // must be last in the list of parsers to return original unparsed string + stringParser { it.toBigDecimalOrNull() }, + // JSON array as DataFrame<*> + stringParser(catch = true) { + val trimmed = it.trim() + if (trimmed.startsWith("[") && trimmed.endsWith("]")) { + DataFrame.readJsonStr(it) + } else { + null + } + }, + // JSON object as DataRow<*> + stringParser(catch = true) { + val trimmed = it.trim() + if (trimmed.startsWith("{") && trimmed.endsWith("}")) { + DataFrame.readJsonStr(it).single() + } else { + null + } + }, + // No parser found, return as String + // must be last in the list of parsers to return original unparsed string + stringParser { it }, ) - private val parsersMap = parsersOrder.associateBy { it.type } + internal val parsersMap = parsersOrder.associateBy { it.type } val size: Int = parsersOrder.size @@ -352,49 +443,76 @@ internal object Parsers : GlobalParserOptions { } } +/** + * Tries to parse a column of strings into a column of a different type. + * Each parser in [Parsers] is run in order until a valid parser is found, + * a.k.a. that parser was able to parse all values in the column successfully. If a parser + * fails to parse any value, the next parser is tried. If all the others fail, the final parser + * simply returns the original string, leaving the column unchanged. + * + * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped. 
+ * + * @param options options for parsing, like providing a locale or a custom date-time formatter + * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled) + * @return a new column with parsed values + */ internal fun DataColumn.tryParseImpl(options: ParserOptions?): DataColumn<*> { - var parserId = 0 - val parsedValues = mutableListOf() - var hasNulls: Boolean - var hasNotNulls: Boolean - var nullStringParsed: Boolean + val columnSize = size + val parsedValues = ArrayList(columnSize) + var hasNulls: Boolean = false + var hasNotNulls: Boolean = false + var nullStringParsed: Boolean = false val nulls = options?.nullStrings ?: Parsers.nulls - do { - val parser = Parsers[parserId].applyOptions(options) + + val parsersToCheck = List(Parsers.size) { Parsers[it] } // ordered list; keeps parsers that share a type + val parserTypesToCheck = Parsers.parsersMap.keys + + var correctParser: StringParser<*>? = null + for (parser in parsersToCheck) { + if (parser.coveredBy.any { it in parserTypesToCheck }) continue + + val parserWithOptions = parser.applyOptions(options) parsedValues.clear() hasNulls = false hasNotNulls = false nullStringParsed = false - for (str in values) { + for (str in this) { when { str == null -> { - parsedValues.add(null) + parsedValues += null hasNulls = true } - nulls.contains(str) -> { - parsedValues.add(null) + str in nulls -> { + parsedValues += null hasNulls = true nullStringParsed = true } else -> { val trimmed = str.trim() - val res = parser(trimmed) + val res = parserWithOptions(trimmed) if (res == null) { - parserId++ - break + break // this parser failed on a value: stop scanning and try the next parser } - parsedValues.add(res) + parsedValues += res hasNotNulls = true } } } - } while (parserId < Parsers.size && parsedValues.size != size) - check(parserId < Parsers.size) { "Valid parser not found" } - val type = (if (hasNotNulls) Parsers[parserId].type else this.type()).withNullability(hasNulls) - if (type.jvmErasure == String::class && !nullStringParsed) return this // nothing parsed + // break when everything is
parsed + if (parsedValues.size >= columnSize) { + correctParser = parser + break + } + } + check(correctParser != null) { "Valid parser not found" } + + val type = (if (hasNotNulls) correctParser.type else this.type()).withNullability(hasNulls) + if (type.jvmErasure == String::class && !nullStringParsed) { + return this // nothing parsed + } return DataColumn.create(name(), parsedValues, type) }