Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support resultSet direct push for SparkSqlExecutor #5170

Merged
merged 12 commits into from
Sep 24, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,7 @@ object JobRequestConstants {

val LINKIS_JDBC_DEFAULT_DB = "linkis.jdbc.default.db"

val ENABLE_DIRECT_PUSH = "enableDirectPush"

val DIRECT_PUSH_FETCH_SIZE = "direct_push_fetch_size"
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import org.apache.linkis.engineconn.core.EngineConnObject
import org.apache.linkis.engineconn.core.executor.ExecutorManager
import org.apache.linkis.engineconn.executor.entity.{LabelExecutor, ResourceExecutor}
import org.apache.linkis.engineconn.executor.listener.ExecutorListenerBusContext
import org.apache.linkis.governance.common.constant.job.JobRequestConstants
import org.apache.linkis.governance.common.entity.ExecutionNodeStatus
import org.apache.linkis.governance.common.paser.CodeParser
import org.apache.linkis.governance.common.protocol.task.{EngineConcurrentInfo, RequestTask}
Expand Down Expand Up @@ -88,7 +89,7 @@ abstract class ComputationExecutor(val outputPrintLimit: Int = 1000)

protected val failedTasks: Count = new Count

private var lastTask: EngineConnTask = _
protected var lastTask: EngineConnTask = _

private val MAX_TASK_EXECUTE_NUM = ComputationExecutorConf.ENGINE_MAX_TASK_EXECUTE_NUM.getValue

Expand Down Expand Up @@ -232,11 +233,13 @@ abstract class ComputationExecutor(val outputPrintLimit: Int = 1000)
}
val code = codes(index)
engineExecutionContext.setCurrentParagraph(index + 1)

response = Utils.tryCatch(if (incomplete.nonEmpty) {
executeCompletely(engineExecutionContext, code, incomplete.toString())
} else executeLine(engineExecutionContext, code)) { t =>
ErrorExecuteResponse(ExceptionUtils.getRootCauseMessage(t), t)
}

incomplete ++= code
response match {
case e: ErrorExecuteResponse =>
Expand Down Expand Up @@ -355,6 +358,12 @@ abstract class ComputationExecutor(val outputPrintLimit: Int = 1000)
engineConnTask.getProperties.get(RequestTask.RESULT_SET_STORE_PATH).toString
)
}
if (engineConnTask.getProperties.containsKey(JobRequestConstants.ENABLE_DIRECT_PUSH)) {
engineExecutionContext.setEnableDirectPush(
engineConnTask.getProperties.get(JobRequestConstants.ENABLE_DIRECT_PUSH).toString.toBoolean
)
logger.info(s"Enable direct push in engineTask ${engineConnTask.getTaskId}.")
}
logger.info(s"StorePath : ${engineExecutionContext.getStorePath.orNull}.")
engineExecutionContext.setJobId(engineConnTask.getTaskId)
engineExecutionContext.getProperties.putAll(engineConnTask.getProperties)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ class EngineExecutionContext(executor: ComputationExecutor, executorUser: String

private var totalParagraph = 0
private var currentParagraph = 0
private var enableDirectPush = false

def getTotalParagraph: Int = totalParagraph

Expand All @@ -76,6 +77,11 @@ class EngineExecutionContext(executor: ComputationExecutor, executorUser: String

def setCurrentParagraph(currentParagraph: Int): Unit = this.currentParagraph = currentParagraph

/**
 * Records whether direct push is enabled for this execution context.
 *
 * @param enable
 *   the new value of the direct-push flag
 */
def setEnableDirectPush(enable: Boolean): Unit = {
  enableDirectPush = enable
}

/**
 * @return
 *   the current value of the direct-push flag of this execution context
 */
def isEnableDirectPush: Boolean = {
  this.enableDirectPush
}

def pushProgress(progress: Float, progressInfo: Array[JobProgressInfo]): Unit =
if (!executor.isInternalExecute) {
val listenerBus = getEngineSyncListenerBus
Expand Down
5 changes: 5 additions & 0 deletions linkis-engineconn-plugins/spark/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.arrow</groupId>
<artifactId>arrow-vector</artifactId>
<version>${arrow.version}</version>
</dependency>
<dependency>
<groupId>org.codehaus.janino</groupId>
<artifactId>janino</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.linkis.engineplugin.spark;

import org.apache.linkis.engineplugin.spark.utils.DataFrameResponse;
import org.apache.linkis.engineplugin.spark.utils.DirectPushCache;
import org.apache.linkis.server.Message;

import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestMethod;
import org.springframework.web.bind.annotation.RestController;

import javax.servlet.http.HttpServletRequest;

import java.util.Map;

import io.swagger.annotations.Api;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Api(tags = "DirectPush")
@RestController
@RequestMapping(path = "directpush")
public class DirectPushRestfulApi {
private static final Logger logger = LoggerFactory.getLogger(DirectPushRestfulApi.class);

@RequestMapping(path = "pull", method = RequestMethod.POST)
public Message getDirectPushResult(
HttpServletRequest req, @RequestBody Map<String, Object> json) {
Message message = null;
try {
String taskId = (String) json.getOrDefault("taskId", null);
if (taskId == null) {
message = Message.error("taskId is null");
return message;
}
int fetchSize = (int) json.getOrDefault("fetchSize", 1000);

DataFrameResponse response = DirectPushCache.fetchResultSetOfDataFrame(taskId, fetchSize);
if (response.dataFrame() == null) {
message = Message.error("No result found for taskId: " + taskId);
} else {
message =
Message.ok()
.data("data", response.dataFrame())
.data("hasMoreData", response.hasMoreData());
}
} catch (Exception e) {
logger.error("Failed to get direct push result", e);
message = Message.error("Failed to get direct push result: " + e.getMessage());
}
return message;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,13 @@ import org.apache.linkis.engineconn.computation.executor.execute.EngineExecution
import org.apache.linkis.engineplugin.spark.common.{Kind, SparkSQL}
import org.apache.linkis.engineplugin.spark.config.SparkConfiguration
import org.apache.linkis.engineplugin.spark.entity.SparkEngineSession
import org.apache.linkis.engineplugin.spark.utils.EngineUtils
import org.apache.linkis.engineplugin.spark.utils.{ArrowUtils, DirectPushCache, EngineUtils}
import org.apache.linkis.governance.common.constant.job.JobRequestConstants
import org.apache.linkis.governance.common.paser.SQLCodeParser
import org.apache.linkis.scheduler.executer.{
ErrorExecuteResponse,
ExecuteResponse,
SuccessExecuteResponse
}
import org.apache.linkis.scheduler.executer._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.spark.sql.DataFrame

import java.lang.reflect.InvocationTargetException

Expand All @@ -47,6 +44,16 @@ class SparkSqlExecutor(sparkEngineSession: SparkEngineSession, id: Long)

override protected def getKind: Kind = SparkSQL()

/**
 * Registers the DataFrame of a task in [[DirectPushCache]].
 *
 * Only used in the direct-push scenario: the DataFrame is not fetched in one go here, it is kept
 * in the in-memory cache so that its content can be returned when the client pulls it. A task
 * id that is already cached is left untouched and only reported in the log.
 *
 * @param taskId
 *   id of the task that produced the DataFrame
 * @param df
 *   the DataFrame to cache
 */
private def submitResultSetIterator(taskId: String, df: DataFrame): Unit = {
  val alreadyCached = DirectPushCache.isTaskCached(taskId)
  if (alreadyCached) {
    logger.error(s"Task $taskId already exists in resultSet cache.")
  } else {
    DirectPushCache.submitExecuteResult(taskId, df)
  }
}

override protected def runCode(
executor: SparkEngineConnExecutor,
code: String,
Expand Down Expand Up @@ -89,14 +96,19 @@ class SparkSqlExecutor(sparkEngineSession: SparkEngineSession, id: Long)
)
)
)
SQLSession.showDF(
sparkEngineSession.sparkContext,
jobGroup,
df,
null,
SparkConfiguration.SHOW_DF_MAX_RES.getValue,
engineExecutionContext
)

if (engineExecutionContext.isEnableDirectPush) {
submitResultSetIterator(lastTask.getTaskId, df)
} else {
SQLSession.showDF(
sparkEngineSession.sparkContext,
jobGroup,
df,
null,
SparkConfiguration.SHOW_DF_MAX_RES.getValue,
engineExecutionContext
)
}
SuccessExecuteResponse()
} catch {
case e: InvocationTargetException =>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.linkis.engineplugin.spark.utils

import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector._
import org.apache.arrow.vector.ipc.ArrowStreamWriter
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types._

import java.io.ByteArrayOutputStream
import java.util

/**
 * Helpers that serialize a Spark DataFrame into the Arrow IPC streaming format.
 *
 * Integer, Long, Double and Boolean columns are mapped to the matching Arrow vectors; every other
 * column type is written as UTF-8 text.
 */
object ArrowUtils {

  /**
   * Collects the DataFrame on the driver and encodes it as a single Arrow record batch.
   *
   * All Arrow resources are released before returning, also when the conversion fails.
   *
   * @param df
   *   the DataFrame to serialize; all of its rows are collected into driver memory
   * @return
   *   the bytes of an Arrow stream (schema followed by one record batch)
   */
  def toArrow(df: DataFrame): Array[Byte] = {
    val allocator = new RootAllocator(Long.MaxValue)
    try {
      val (root, fieldVectors) = createArrowVectors(df, allocator)
      try {
        val outStream = new ByteArrayOutputStream()
        val writer = new ArrowStreamWriter(root, null, outStream)
        try {
          writer.start()
          writer.writeBatch()
          writer.end()
        } finally {
          writer.close()
        }
        outStream.toByteArray
      } finally {
        // Vectors must be released before the allocator is closed.
        fieldVectors.foreach(_.close())
      }
    } finally {
      allocator.close()
    }
  }

  /**
   * Builds one populated Arrow vector per DataFrame column.
   *
   * The DataFrame is evaluated exactly once (a single `collect()`); the number of collected rows
   * is used to size the vectors. If anything fails, the vectors created so far are closed before
   * the exception is rethrown.
   */
  private def createArrowVectors(
      df: DataFrame,
      allocator: RootAllocator
  ): (VectorSchemaRoot, List[FieldVector]) = {
    val rows = df.collect()
    val rowCount = rows.length
    // Constructing a vector does not allocate buffers yet, so nothing can leak here.
    val fieldVectors: List[FieldVector] =
      df.schema.fields.map(field => newVector(field, allocator)).toList

    var completed = false
    try {
      fieldVectors.foreach(vector => allocate(vector, rowCount))

      var i = 0
      while (i < rowCount) {
        val row = rows(i)
        fieldVectors.iterator.zipWithIndex.foreach { case (vector, j) =>
          writeCell(vector, i, row, j)
        }
        i += 1
      }
      fieldVectors.foreach(_.setValueCount(rowCount))

      val javaFieldVectors: util.ArrayList[FieldVector] =
        new util.ArrayList[FieldVector](fieldVectors.size)
      fieldVectors.foreach(vector => javaFieldVectors.add(vector))
      // Created after the vectors are filled so that the root picks up their value count.
      val root = new VectorSchemaRoot(javaFieldVectors)

      completed = true
      (root, fieldVectors)
    } finally {
      if (!completed) fieldVectors.foreach(_.close())
    }
  }

  /** Chooses the Arrow vector type for a column; unsupported types fall back to VarChar. */
  private def newVector(field: StructField, allocator: RootAllocator): FieldVector =
    field.dataType match {
      case IntegerType => new IntVector(field.name, allocator)
      case LongType => new BigIntVector(field.name, allocator)
      case DoubleType => new Float8Vector(field.name, allocator)
      case BooleanType => new BitVector(field.name, allocator)
      case _ => new VarCharVector(field.name, allocator)
    }

  /** Allocates buffers for `rowCount` values. */
  private def allocate(vector: FieldVector, rowCount: Int): Unit = vector match {
    case fixed: BaseFixedWidthVector => fixed.allocateNew(rowCount)
    case variable: BaseVariableWidthVector => variable.allocateNew(rowCount)
    case other => other.allocateNew()
  }

  /**
   * Writes column `j` of `row` into position `i` of `vector`.
   *
   * SQL NULLs are stored as Arrow nulls. Values of columns without a dedicated vector type are
   * written as the UTF-8 bytes of their `toString` representation.
   */
  private def writeCell(
      vector: FieldVector,
      i: Int,
      row: org.apache.spark.sql.Row,
      j: Int
  ): Unit = {
    val isNull = row.isNullAt(j)
    vector match {
      case v: IntVector =>
        if (isNull) v.setNull(i) else v.setSafe(i, row.getInt(j))
      case v: BigIntVector =>
        if (isNull) v.setNull(i) else v.setSafe(i, row.getLong(j))
      case v: Float8Vector =>
        if (isNull) v.setNull(i) else v.setSafe(i, row.getDouble(j))
      case v: BitVector =>
        if (isNull) v.setNull(i) else v.setSafe(i, if (row.getBoolean(j)) 1 else 0)
      case v: VarCharVector =>
        if (isNull) {
          v.setNull(i)
        } else {
          // row.get(j).toString also covers non-string columns, where getString would fail.
          val bytes = row.get(j).toString.getBytes(java.nio.charset.StandardCharsets.UTF_8)
          v.setSafe(i, bytes)
        }
      case other =>
        throw new IllegalStateException(
          s"Unsupported Arrow vector type: ${other.getClass.getName}"
        )
    }
  }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.linkis.engineplugin.spark.utils

import org.apache.linkis.engineconn.common.conf.{EngineConnConf, EngineConnConstant}

import org.apache.spark.sql.DataFrame

import java.util.concurrent.TimeUnit

import com.google.common.cache.{Cache, CacheBuilder}

/**
 * One batch of a direct-push result set, as returned by
 * `DirectPushCache.fetchResultSetOfDataFrame`.
 *
 * @param dataFrame
 *   the rows that belong to this batch
 * @param hasMoreData
 *   whether the cache reported that further rows may still be fetched for the task
 */
case class DataFrameResponse(dataFrame: DataFrame, hasMoreData: Boolean)

/**
 * In-memory cache of the result sets of direct-push tasks, keyed by task id.
 *
 * Entries expire after `EngineConnConf.ENGINE_TASK_EXPIRE_TIME` milliseconds without access and
 * the cache holds at most `EngineConnConstant.MAX_TASK_NUM` entries.
 */
object DirectPushCache {

  /**
   * Forward-only cursor over the rows of one cached DataFrame.
   *
   * Rows are streamed through a single local iterator, so each row is delivered exactly once, in
   * a stable order and with duplicates preserved. Paging with `limit` + `except` cannot guarantee
   * this: `except` is a DISTINCT set difference and `limit` without ordering is not deterministic
   * across evaluations.
   */
  private final class ResultCursor(df: DataFrame) {

    // Created on first fetch so that submitting a result stays cheap and lazy.
    private lazy val rows: java.util.Iterator[org.apache.spark.sql.Row] = df.toLocalIterator()

    /** Pulls up to `fetchSize` further rows and wraps them in a DataFrame of the same schema. */
    def nextBatch(fetchSize: Int): DataFrameResponse = synchronized {
      val batch = new java.util.ArrayList[org.apache.spark.sql.Row]()
      while (batch.size() < fetchSize && rows.hasNext) {
        batch.add(rows.next())
      }
      val batchDf = df.sparkSession.createDataFrame(batch, df.schema)
      DataFrameResponse(batchDf, hasMoreData = rows.hasNext)
    }

  }

  private val resultSet: Cache[String, ResultCursor] = CacheBuilder
    .newBuilder()
    .expireAfterAccess(EngineConnConf.ENGINE_TASK_EXPIRE_TIME.getValue, TimeUnit.MILLISECONDS)
    .maximumSize(EngineConnConstant.MAX_TASK_NUM)
    .build()

  /**
   * Returns the next batch of at most `fetchSize` rows of the cached result of a task.
   *
   * This method is not idempotent: rows that have been returned are consumed and will not be
   * returned again. Once the last row has been handed out, the task is removed from the cache.
   *
   * @param taskId
   *   id of the task whose result is requested
   * @param fetchSize
   *   maximum number of rows in the returned batch; must be positive
   * @return
   *   the batch together with a flag telling whether more rows remain
   * @throws IllegalArgumentException
   *   if `fetchSize` is not positive
   * @throws IllegalAccessException
   *   if no result is cached for `taskId`
   */
  def fetchResultSetOfDataFrame(taskId: String, fetchSize: Int): DataFrameResponse = {
    require(fetchSize > 0, s"fetchSize must be positive, but got $fetchSize.")
    val cursor = Option(resultSet.getIfPresent(taskId)).getOrElse {
      throw new IllegalAccessException(s"Task $taskId not exists in resultSet cache.")
    }
    val response = cursor.nextBatch(fetchSize)
    if (!response.hasMoreData) {
      // All the data of the task has been consumed.
      resultSet.invalidate(taskId)
    }
    response
  }

  /**
   * @return
   *   true if a result is currently cached for `taskId`
   */
  def isTaskCached(taskId: String): Boolean = {
    resultSet.getIfPresent(taskId) != null
  }

  /**
   * Caches the (still lazy) result DataFrame of a task, replacing any previous entry for the same
   * task id. The DataFrame is not evaluated until the first batch is fetched.
   */
  def submitExecuteResult(taskId: String, df: DataFrame): Unit = {
    resultSet.put(taskId, new ResultCursor(df))
  }

}
Loading
Loading