-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from ghazi-naceur/csv-to-parquet
Csv to parquet conversion
- Loading branch information
Showing
51 changed files
with
619 additions
and
190 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,9 @@ | ||
// Routing configuration: `type` names the conversion, `in` / `out` are the input and output folders.
// NOTE(review): `type`, `in` and `out` are each assigned twice in this block. The first three
// assignments look like the removed side of the diff (the hunk header says 5 old lines, 9 new lines).
// If this is parsed as HOCON, a later assignment of the same key overrides the earlier one, so the
// csv-to-parquet values would be the effective ones — TODO confirm and drop the stale lines.
route { | ||
type = xlsx-to-csv | ||
in = src/test/resources/xlsx-data/input | ||
out = src/test/resources/xlsx-data/output | ||
// Previous xlsx-to-csv route, kept commented out for reference.
// type = xlsx-to-csv | ||
// in = src/test/resources/xlsx_to_csv-data/input | ||
// out = src/test/resources/xlsx_to_csv-data/output | ||
// | ||
// Active route: csv to parquet conversion.
type = csv-to-parquet | ||
in = src/test/resources/csv_to_parquet-data/input/ | ||
out = src/test/resources/csv_to_parquet-data/output/ | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
package io.oss.data.highway.utils | ||
|
||
import java.io.File | ||
|
||
import io.oss.data.highway.model.DataHighwayError.ReadFileError | ||
import cats.syntax.either._ | ||
|
||
object FilesUtils {

  /**
    * Gets the paths of the files located, at any depth, under a provided path
    *
    * @param path       The provided path
    * @param extensions The accepted file extensions, without the leading dot
    * @return the paths (extension included) of the files having one of the provided extensions,
    *         otherwise a ReadFileError
    */
  private[utils] def getFilesFromPath(
      path: String,
      extensions: Seq[String]): Either[ReadFileError, List[String]] = {
    Either
      .catchNonFatal {
        listFilesRecursively(new File(path), extensions).map(_.getPath).toList
      }
      .leftMap(thr =>
        ReadFileError(thr.getMessage, thr.getCause, thr.getStackTrace))
  }

  /**
    * Lists files recursively from a path
    *
    * The files located directly under the path come first, followed by the files found
    * in each of its sub-directories.
    *
    * @param path       The provided path
    * @param extensions The accepted file extensions, without the leading dot
    * @return a Seq of files having one of the provided extensions
    * @throws java.io.IOException if the content of a folder cannot be listed
    */
  private[utils] def listFilesRecursively(
      path: File,
      extensions: Seq[String]): Seq[File] = {
    val children = listChildren(path)
    val matchingFiles = children
      .filter(_.isFile)
      .filter(file => filterByExtension(file.getPath, extensions))
    val nestedFiles = children
      .filter(_.isDirectory)
      .flatMap(directory => listFilesRecursively(directory, extensions))
    matchingFiles ++ nestedFiles
  }

  /**
    * Checks that the provided file has an extension that belongs to the provided ones
    *
    * @param file       The provided file
    * @param extensions The provided extensions
    * @return True if the file has a valid extension, otherwise False
    */
  private[utils] def filterByExtension(file: String,
                                       extensions: Seq[String]): Boolean = {
    // "/".split("/") yields an empty array, so `.last` would throw: fall back to an empty name.
    val fileName = file.split("/").lastOption.getOrElse("")
    val dotIndex = fileName.lastIndexOf(".")
    // A name without any dot has no extension. Without this guard lastIndexOf returns -1 and the
    // whole name would be compared, so a file called "csv" would wrongly match the "csv" extension.
    dotIndex >= 0 && extensions.contains(fileName.substring(dotIndex + 1))
  }

  /**
    * Lists recursively, from a path, the folders that directly contain at least one entry
    * which is not a directory. Folders holding only sub-folders (or nothing) are skipped.
    *
    * @param path The provided path
    * @return a List of folders paths, otherwise a ReadFileError
    */
  private[utils] def listFoldersRecursively(
      path: String): Either[ReadFileError, List[String]] = {
    @scala.annotation.tailrec
    def getFolders(pending: List[File], results: List[File]): List[File] =
      pending match {
        case head :: tail =>
          val children = listChildren(head)
          val directories = children.filter(_.isDirectory)
          val updated =
            if (children.size == directories.size) results
            else head :: results
          getFolders(tail ++ directories, updated)
        case Nil => results
      }

    Either
      .catchNonFatal {
        // Results are accumulated by prepending, hence the reverse to restore the visiting order.
        getFolders(new File(path) :: Nil, Nil).reverse.map(_.getPath)
      }
      .leftMap(thr =>
        ReadFileError(thr.getMessage, thr.getCause, thr.getStackTrace))
  }

  /**
    * Replaces each backslash by a slash
    *
    * @param path The provided path
    * @return a path with slash as file separator
    */
  def reversePathSeparator(path: String): String =
    path.replace("\\", "/")

  /**
    * Lists the direct children of a folder
    *
    * `File.listFiles` returns null when the path is not a directory or when an I/O error occurs.
    * That case is turned into an explicit exception so that the resulting error carries a
    * message instead of a bare NullPointerException.
    *
    * @param directory The folder to list
    * @return the direct children of the folder
    * @throws java.io.IOException if the content of the folder cannot be listed
    */
  private def listChildren(directory: File): List[File] =
    Option(directory.listFiles)
      .map(_.toList)
      .getOrElse(throw new java.io.IOException(
        s"Unable to list the content of '${directory.getPath}': not a directory or I/O error"))
}
86 changes: 86 additions & 0 deletions
86
src/main/scala/io/oss/data/highway/utils/ParquetHandler.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
package io.oss.data.highway.utils | ||
|
||
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} | ||
import cats.implicits._ | ||
import io.oss.data.highway.model.DataHighwayError | ||
import io.oss.data.highway.model.DataHighwayError.{ParquetError, ReadFileError} | ||
|
||
object ParquetHandler {

  // Spark session shared by every operation of this handler; it runs on a local master.
  val ss: SparkSession = SparkSession
    .builder()
    .appName("parquet-handler")
    .master("local[*]")
    .getOrCreate()
  ss.sparkContext.setLogLevel("WARN")

  /**
    * Builds a ParquetError out of a caught failure
    *
    * @param failure The caught failure
    * @return a ParquetError carrying the failure message, cause and stack trace
    */
  private def asParquetError(failure: Throwable): ParquetError =
    ParquetError(failure.getMessage, failure.getCause, failure.getStackTrace)

  /**
    * Reads csv data, with a header line and an inferred schema, and writes it as parquet
    *
    * @param in The input csv path
    * @param out The generated parquet file path
    * @param columnSeparator The column separator for each line in the csv file
    * @param saveMode The file saving mode
    * @return Unit if successful, otherwise Error
    */
  def saveCsvAsParquet(in: String,
                       out: String,
                       columnSeparator: String,
                       saveMode: SaveMode): Either[ParquetError, Unit] = {
    Either
      .catchNonFatal {
        val csvFrame = ss.read
          .option("inferSchema", "true")
          .option("header", "true")
          .option("sep", columnSeparator)
          .csv(in)
        csvFrame.write.mode(saveMode).parquet(out)
      }
      .leftMap(asParquetError)
  }

  /**
    * Loads a parquet file as a DataFrame
    *
    * @param path The parquet file path
    * @return DataFrame, otherwise Error
    */
  def readParquet(path: String): Either[ParquetError, DataFrame] =
    Either
      .catchNonFatal(ss.read.parquet(path))
      .leftMap(asParquetError)

  /**
    * Converts csv files to parquet files
    *
    * Each folder returned by FilesUtils.listFoldersRecursively for the input path is converted
    * into a parquet output written under the output path, in a sub-folder named after the last
    * segment of the input folder path.
    *
    * @param in The input csv path
    * @param out The generated parquet file path
    * @param columnSeparator The column separator for each line in the csv file
    * @param saveMode The file saving mode
    * @return List[Unit], otherwise Error
    */
  def apply(in: String,
            out: String,
            columnSeparator: String,
            saveMode: SaveMode): Either[DataHighwayError, List[Unit]] = {
    // Converts a single input folder, keeping its own name as the output sub-folder.
    def convertFolder(folder: String): Either[ParquetError, Unit] = {
      val folderName = FilesUtils.reversePathSeparator(folder).split("/").last
      saveCsvAsParquet(folder, s"$out/$folderName", columnSeparator, saveMode)
    }

    for {
      folders <- FilesUtils.listFoldersRecursively(in)
      converted <- folders
        .traverse(folder => convertFolder(folder))
        .leftMap(error =>
          ParquetError(error.message, error.cause, error.stacktrace))
    } yield converted
  }
}
Oops, something went wrong.