Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KMP #30

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open

KMP #30

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions crux-kmp/build.gradle.kts
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
// Applies the Kotlin Multiplatform and Android library Gradle plugins through the `libs` plugin aliases.
plugins {
alias(libs.plugins.kotlinMultiplatform)
alias(libs.plugins.androidLibrary)
}

// Kotlin Multiplatform setup: one Android target, three iOS targets, and the dependencies of the common source set.
kotlin {
// Android target; every compilation is configured to emit JVM 1.8 bytecode.
androidTarget {
compilations.all {
kotlinOptions {
jvmTarget = "1.8"
}
}
}

// Each of the three iOS targets produces a static framework whose base name is "shared".
listOf(
iosX64(),
iosArm64(),
iosSimulatorArm64()
).forEach {
it.binaries.framework {
baseName = "shared"
isStatic = true
}
}

sourceSets {
// Dependencies declared here apply to the common source set.
commonMain{
dependencies {
implementation(libs.ktor.client.core)
implementation(libs.ksoup)
// NOTE(review): if `libs.klaxon` points at com.beust:klaxon, that artifact is JVM-only and may not resolve
// for the iOS targets above. Confirm against the version catalog.
implementation(libs.klaxon)
}
}
}
}

// Android library configuration: namespace, compile SDK level, and minimum supported SDK level.
android {
namespace = "com.chimbori.crux_kmp"
compileSdk = 34
defaultConfig {
minSdk = 24
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package com.chimbori.crux_kmp

/** [Platform] implementation for Android; [name] embeds the value of `android.os.Build.VERSION.SDK_INT`. */
class AndroidPlatform : Platform {
    override val name: String =
        "Android ${android.os.Build.VERSION.SDK_INT}"
}

/** Android `actual` implementation of `getPlatform()`; returns a new [AndroidPlatform] on every call. */
actual fun getPlatform(): Platform {
    return AndroidPlatform()
}
93 changes: 93 additions & 0 deletions crux-kmp/src/commonMain/kotlin/com/chimbori/crux_kmp/Crux.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package com.chimbori.crux

import com.chimbori.crux_kmp.api.Extractor
import com.chimbori.crux_kmp.api.Plugin
import com.chimbori.crux_kmp.api.Resource
import com.chimbori.crux_kmp.api.Rewriter
import com.chimbori.crux_kmp.common.CHROME_USER_AGENT
import com.chimbori.crux_kmp.plugins.AmpRedirector
import com.chimbori.crux_kmp.plugins.FacebookUrlRewriter
import com.chimbori.crux_kmp.plugins.FaviconExtractor
import com.chimbori.crux_kmp.plugins.GoogleUrlRewriter
import com.chimbori.crux_kmp.plugins.HtmlMetadataExtractor
import com.chimbori.crux_kmp.plugins.TrackingParameterRemover
import com.chimbori.crux_kmp.plugins.WebAppManifestParser
import com.fleeksoft.ksoup.nodes.Document
import io.ktor.client.HttpClient
import io.ktor.http.Url
import io.ktor.http.headers
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.IO
import kotlinx.coroutines.withContext

/**
 * Builds the ordered list of plugins that Crux uses by default. Callers may pass their own list instead, or pick
 * individual plugins from this set to assemble a custom configuration.
 *
 * @param httpClient the client handed to every default plugin that takes one in its constructor.
 * @return the rewriters first, followed by the extractors, in the order they should run.
 */
public fun createDefaultPlugins(httpClient: HttpClient): List<Plugin> {
    // Rewriters
    val rewriters = listOf<Plugin>(
        // Static redirectors go first, to avoid getting stuck into CAPTCHAs.
        GoogleUrlRewriter(),
        FacebookUrlRewriter(),
        // Remove any tracking parameters remaining.
        TrackingParameterRemover(),
    )

    // Extractors
    val extractors = listOf<Plugin>(
        // Parses many standard HTML metadata attributes. Fetches the Web page, so this must be the first [Extractor].
        HtmlMetadataExtractor(httpClient),
        // Prefer canonical URLs over AMP URLs.
        AmpRedirector(refetchContentFromCanonicalUrl = true, httpClient),
        // Fetches and parses the Web Manifest. May replace existing favicon URL with one from the manifest.json.
        WebAppManifestParser(httpClient),
        // Extracts the best possible favicon from all the markup available on the page itself.
        FaviconExtractor(),
    )

    return rewriters + extractors
}

/**
 * Runs a configurable, ordered chain of plugins over a URL. Every [Rewriter] gets a chance to rewrite the URL first;
 * every [Extractor] then contributes metadata (possibly making its own HTTP requests) and hands the merged
 * [Resource] to the next extractor in the chain.
 */
public class Crux(
    /** Plugins to run, in order. When `null`, the list from [createDefaultPlugins] is used. */
    private val plugins: List<Plugin>? = null,

    /** The [HttpClient] given to the default plugins; Crux creates its own when the caller does not provide one. */
    httpClient: HttpClient = createCruxOkHttpClient(),
) {

    private val activePlugins: List<Plugin> = plugins ?: createDefaultPlugins(httpClient)

    /**
     * Processes the provided URL, and returns a metadata object containing custom fields.
     *
     * @param originalUrl the URL to extract metadata and content from.
     * @param parsedDoc an already-parsed DOM tree to seed the initial [Resource] with, so that it does not have to
     * be parsed again. If a custom [Document] is provided, Crux will not make any HTTP requests itself, and may not
     * follow HTTP redirects (but plugins may still optionally make additional HTTP requests themselves.)
     * @return the merged [Resource], with all `null`-valued metadata entries removed.
     */
    public suspend fun extractFrom(originalUrl: Url, parsedDoc: Document? = null): Resource =
        withContext(Dispatchers.IO) {
            val rewrittenUrl = applyRewriters(originalUrl)
            val seed = Resource(url = rewrittenUrl, document = parsedDoc)
            runExtractors(seed, fallbackUrl = rewrittenUrl).removeNullValues()
        }

    /** Threads [url] through every active [Rewriter], in plugin order, and returns the final URL. */
    private fun applyRewriters(url: Url): Url =
        activePlugins
            .filterIsInstance<Rewriter>()
            .fold(url) { current, rewriter -> rewriter.rewrite(current) }

    /**
     * Runs every active [Extractor] that accepts the current URL, merging each result into the running [Resource].
     * [fallbackUrl] is offered to `canExtract` whenever the running resource has no URL of its own.
     */
    private suspend fun runExtractors(initial: Resource, fallbackUrl: Url): Resource {
        var current = initial
        for (extractor in activePlugins.filterIsInstance<Extractor>()) {
            if (extractor.canExtract(current.url ?: fallbackUrl)) {
                current = current + extractor.extract(current)
            }
        }
        return current
    }
}

/**
 * Creates the [HttpClient] that Crux falls back to when the caller does not supply one. The client follows HTTP
 * redirects and sends [CHROME_USER_AGENT] as the `User-Agent` header of every request.
 */
internal fun createCruxOkHttpClient(): HttpClient = HttpClient {
    followRedirects = true
    // A bare `headers { ... }` call in this scope resolves to `io.ktor.http.headers`, which only builds a detached
    // `Headers` value that is then discarded, so the header never reached any request. The `UserAgent` plugin
    // applies the header to every request made through this client.
    install(io.ktor.client.plugins.UserAgent) {
        agent = CHROME_USER_AGENT
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package com.chimbori.crux_kmp

/** Describes the platform the code is running on. */
interface Platform {
    /** Human-readable name of the platform. */
    val name: String
}

/** Returns the [Platform] for the current target; each target supplies the matching `actual` declaration. */
expect fun getPlatform(): Platform
39 changes: 39 additions & 0 deletions crux-kmp/src/commonMain/kotlin/com/chimbori/crux_kmp/api/Fields.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.chimbori.crux_kmp.api

/**
 * Well-known keys to use in [Resource.metadata]. Plugins and callers should refer to these constants rather than
 * repeating the raw strings.
 */
public object Fields {
    // General page metadata.
    public const val TITLE: String = "title"
    public const val DESCRIPTION: String = "description"
    public const val SITE_NAME: String = "site-name"
    public const val LANGUAGE: String = "language"
    public const val DISPLAY: String = "display"
    public const val ORIENTATION: String = "orientation"
    public const val PUBLISHED_AT: String = "published_at"
    public const val MODIFIED_AT: String = "modified_at"

    // Colors. The `*_HTML` variants hold named colors like "aliceblue".
    public const val THEME_COLOR_HEX: String = "theme-color-hex"
    public const val THEME_COLOR_HTML: String = "theme-color-html"
    public const val BACKGROUND_COLOR_HEX: String = "background-color-hex"
    public const val BACKGROUND_COLOR_HTML: String = "background-color-html"

    // URLs related to the resource.
    public const val CANONICAL_URL: String = "canonical-url"
    public const val AMP_URL: String = "amp-url"
    public const val FAVICON_URL: String = "favicon-url"
    public const val BANNER_IMAGE_URL: String = "banner-image-url"
    public const val FEED_URL: String = "feed-url"
    public const val VIDEO_URL: String = "video-url"

    /** See https://www.w3.org/TR/appmanifest/ */
    public const val WEB_APP_MANIFEST_URL: String = "web-app-manifest-url"
    public const val NEXT_PAGE_URL: String = "next-page-url"
    public const val PREVIOUS_PAGE_URL: String = "previous-page-url"

    // For image or video resources only.
    public const val ALT_TEXT: String = "alt-text"
    public const val WIDTH_PX: String = "width-px"
    public const val HEIGHT_PX: String = "height-px"

    /** For articles (estimated reading time) and audio/video content (playback duration). */
    public const val DURATION_MS: String = "duration-ms"

    public const val TWITTER_HANDLE: String = "twitter-handle"
    public const val KEYWORDS_CSV: String = "keywords-csv"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package com.chimbori.crux_kmp.api

import io.ktor.http.Url

/** Common supertype of everything that can be placed in a Crux plugin chain; see [Rewriter] and [Extractor]. */
public sealed interface Plugin

/**
 * A plugin that can modify the URL before any other plugin processes it. Rewriters should not have access to the
 * network, and should execute quickly enough to run on the main thread if necessary.
 */
public fun interface Rewriter : Plugin {
    /**
     * @param url the URL as it stands so far.
     * @return the URL that subsequent plugins should use.
     */
    public fun rewrite(url: Url): Url
}

/**
 * Crux is designed as a chain of plugins, each of which can optionally handle URLs passed to it. Each plugin is
 * provided a fully-parsed HTML DOM to extract fields from, and can also make additional HTTP requests if necessary
 * to retrieve additional metadata or to follow redirects.
 *
 * Metadata fields can be set via the [Resource.metadata] property. Plugins can also rewrite the canonical URL, and
 * can provide an updated DOM tree if the canonical URL is changed. The updated URL and DOM tree will be passed on to
 * the next plugin in sequence, so the exact ordering of plugins is important.
 */
public interface Extractor : Plugin {
    /**
     * @param url URL for the resource being processed by Crux.
     * @return true if this plugin can handle the URL, false otherwise. Plugins can only inspect the [Url], without
     * being able to peek at the content.
     */
    public fun canExtract(url: Url): Boolean

    /**
     * @param request metadata & DOM content for the request being handled.
     * @return a partially populated [Resource] with newly-extracted fields. Include only those fields that need to
     * be set or updated; they will be merged with the set of previously-extracted fields. If no fields need to be
     * updated, return `null`.
     */
    public suspend fun extract(request: Resource): Resource?
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package com.chimbori.crux_kmp.api

import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.nodes.Element
import io.ktor.http.Url


/** A [Resource] encapsulates metadata and content related to an HTTP resource. */
public data class Resource(
    /** Canonical URL for this resource. */
    val url: Url? = null,

    /** Parsed DOM tree for this resource, if available. */
    val document: Document? = null,

    /**
     * Extracted and cleaned-up DOM tree for this resource, if available.
     * If this is null, then article extraction has not been performed, or has failed.
     */
    val article: Element? = null,

    /** A holder for any kind of custom objects that library users may want to use. */
    val metadata: Map<String, Any?> = emptyMap(),
) {
    /** @return value of a named field in [Resource.metadata], or `null` if the key is absent. */
    public operator fun get(key: String): Any? = metadata[key]

    /**
     * Merges non-null fields from another [Resource] with this object, and returns a new immutable object. Prefer
     * this operator over merging two objects by hand, so that all fields are merged correctly and none is clobbered.
     *
     * For [url], [document] and [article], the value from [anotherResource] wins when it is non-null. The [metadata]
     * maps are combined, with entries from [anotherResource] replacing entries that share a key.
     */
    public operator fun plus(anotherResource: Resource?): Resource {
        val mergedMetadata = anotherResource?.metadata?.let { incoming -> metadata + incoming } ?: metadata
        return Resource(
            url = anotherResource?.url ?: url,
            document = anotherResource?.document ?: document,
            article = anotherResource?.article ?: article,
            metadata = mergedMetadata,
        )
    }

    /** Returns an immutable copy of this [Resource] whose [metadata] only keeps entries with non-null values. */
    public fun removeNullValues(): Resource {
        val nonNullMetadata = metadata.filterValues { value -> value != null }
        return copy(metadata = nonNullMetadata)
    }

    /** For any potential extension functions to be defined on the companion object. */
    public companion object
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package com.chimbori.crux_kmp.common

import com.chimbori.crux_kmp.api.Resource
import com.fleeksoft.ksoup.Ksoup
import com.fleeksoft.ksoup.nodes.Document
import com.fleeksoft.ksoup.ported.BufferReader
import io.ktor.client.HttpClient
import io.ktor.client.plugins.ResponseException
import io.ktor.client.request.HttpRequestBuilder
import io.ktor.client.request.request
import io.ktor.client.request.url
import io.ktor.client.statement.HttpResponse
import io.ktor.client.statement.readBytes
import io.ktor.client.statement.request
import io.ktor.http.HttpMethod
import io.ktor.http.HttpStatusCode
import io.ktor.http.Url
import io.ktor.utils.io.errors.IOException
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.IO
import kotlinx.coroutines.withContext

/** Chrome version number that is spliced into [CHROME_USER_AGENT]. */
private const val DEFAULT_BROWSER_VERSION = "100.0.0.0"

/** `User-Agent` string describing a mobile Chrome browser on Android, at [DEFAULT_BROWSER_VERSION]. */
internal const val CHROME_USER_AGENT =
    "Mozilla/5.0 (Linux; Android 11; Build/RQ2A.210505.003) AppleWebKit/537.36 " +
        "(KHTML, like Gecko) Version/4.0 Chrome/$DEFAULT_BROWSER_VERSION Mobile Safari/537.36"

/**
 * Executes the request described by [builder] on this client, on [Dispatchers.IO], without letting request failures
 * escape to the caller.
 *
 * @param builder the fully configured request to execute.
 * @return the response, or `null` if executing the request threw anything other than a cancellation.
 * @throws kotlin.coroutines.cancellation.CancellationException if the request is cancelled; cancellation is never
 * converted into a `null` result.
 */
public suspend fun HttpClient.safeCall(builder: HttpRequestBuilder): HttpResponse? =
    withContext(Dispatchers.IO) {
        try {
            this@safeCall.request(builder)
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Swallowing cancellation would break structured concurrency, so let it propagate.
            throw e
        } catch (t: Throwable) {
            // Deliberately best-effort: I/O failures, invalid URLs such as "https://" (IllegalArgumentException),
            // [ResponseException], and any other unexpected failure are all reported to the caller as `null`.
            null
        }
    }

/**
 * Issues an HTTP GET for [url] through [safeCall].
 *
 * @return whatever [safeCall] returns for the request: the response, or `null`.
 */
public suspend fun HttpClient.safeHttpGet(url: Url): HttpResponse? =
    safeCall(
        HttpRequestBuilder().also { req ->
            req.method = HttpMethod.Get
            req.url(url)
        }
    )

/**
 * Issues an HTTP HEAD for [url] through [safeCall].
 *
 * @return whatever [safeCall] returns for the request: the response, or `null`.
 */
public suspend fun HttpClient.safeHttpHead(url: Url): HttpResponse? =
    safeCall(
        HttpRequestBuilder().also { req ->
            req.method = HttpMethod.Head
            req.url(url)
        }
    )

/**
 * Fetches [url] with an HTTP GET and returns the response body as text.
 *
 * The body bytes are decoded as UTF-8, matching the charset that [fetchFromUrl] passes to the HTML parser.
 *
 * @param url the URL to fetch.
 * @param onError invoked with the failure if reading the response body throws; failures of the request itself are
 * not reported here, because [safeHttpGet] already turns them into a `null` response.
 * @return the body text when the response status is `200 OK`; `null` if the request failed, the status was anything
 * else, or the body could not be read.
 * @throws kotlin.coroutines.cancellation.CancellationException if the coroutine is cancelled while reading the body.
 */
public suspend fun HttpClient.httpGetContent(
    url: Url,
    onError: ((t: Throwable) -> Unit)? = null,
): String? =
    withContext(Dispatchers.IO) {
        safeHttpGet(url)
            ?.takeIf { response -> response.status == HttpStatusCode.OK }
            ?.let { response ->
                try {
                    response.readBytes().decodeToString()
                } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                    // Cancellation is not an error to report; let it propagate.
                    throw e
                } catch (t: Throwable) {
                    onError?.invoke(t)
                    null
                }
            }
    }

/**
 * Fetches [url] with [httpClient] and wraps the outcome in a [Resource].
 *
 * @param url the URL to fetch.
 * @param httpClient the client used to make the request.
 * @return a [Resource] whose `url` is the URL of the request that produced the response when that differs from
 * [url] and the status is `200 OK` (otherwise [url] itself), and whose `document` is the parsed response body, or
 * `null` if there was no response or the body could not be read or parsed.
 * @throws kotlin.coroutines.cancellation.CancellationException if the coroutine is cancelled while reading the body.
 */
public suspend fun Resource.Companion.fetchFromUrl(url: Url, httpClient: HttpClient): Resource =
    withContext(Dispatchers.IO) {
        val httpResponse = httpClient.safeHttpGet(url)

        // If the HTTP request resulted in an HTTP redirect, use the redirected URL.
        val urlToUse = if (httpResponse?.status == HttpStatusCode.OK && httpResponse.request.url != url) {
            httpResponse.request.url
        } else {
            url
        }

        val docToUse: Document? = try {
            httpResponse?.readBytes()?.let { bytes ->
                Ksoup.parse(BufferReader(bytes), "UTF-8", urlToUse.toString())
            }
        } catch (e: kotlin.coroutines.cancellation.CancellationException) {
            // Swallowing cancellation would break structured concurrency, so let it propagate.
            throw e
        } catch (t: Throwable) {
            // Best-effort: a body that cannot be read or parsed simply yields no document.
            null
        }

        Resource(url = urlToUse, document = docToUse)
    }
Loading
Loading