-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add AutoDetectTextExtractor
* Add bytedeco ImageTextExtractor
* Add PDF text extractor
* Add max length validation
* Add white chars filter
* Add minimum word length validator
- Loading branch information
1 parent
fd34656
commit 4355720
Showing
12 changed files
with
219 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
#Mon May 04 21:03:43 CEST 2020 | ||
distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip | ||
distributionBase=GRADLE_USER_HOME | ||
distributionPath=wrapper/dists | ||
distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip | ||
zipStoreBase=GRADLE_USER_HOME | ||
zipStorePath=wrapper/dists |
16 changes: 16 additions & 0 deletions
16
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package pl.edu.uj.ii.ksi.mordor.configuration | ||
|
||
import org.bytedeco.tesseract.TessBaseAPI | ||
import org.springframework.context.annotation.Bean | ||
import org.springframework.context.annotation.Configuration | ||
|
||
/**
 * Spring configuration exposing the [TessBaseAPI] bean that the OCR-based text extractors receive.
 */
@Configuration
class TessBaseConfig {
    /**
     * Creates a Tesseract engine initialised for English and Polish.
     *
     * Both languages are requested in a single `Init` call using Tesseract's `+` syntax. Calling
     * `Init` once per language re-initialises the engine each time, so only the language of the
     * last call would remain loaded.
     *
     * NOTE(review): the tessdata path is relative to the process working directory and points into
     * the source tree; confirm it resolves in the deployed environment.
     *
     * @return the initialised engine.
     * @throws IllegalStateException if `Init` returns a non-zero status.
     */
    @Bean
    fun tessBaseAPI(): TessBaseAPI {
        val api = TessBaseAPI()
        val status = api.Init("./src/main/resources/tessdata/", "eng+pol")
        // Fail fast: using an engine whose initialisation failed only postpones the error to the first OCR call.
        check(status == 0) { "Could not initialise Tesseract for languages eng+pol (status $status)" }
        return api
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
67 changes: 67 additions & 0 deletions
67
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import java.io.File | ||
import java.io.IOException | ||
import java.lang.StringBuilder | ||
import org.apache.tika.Tika | ||
import org.bytedeco.tesseract.TessBaseAPI | ||
import org.slf4j.LoggerFactory | ||
import org.springframework.stereotype.Service | ||
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService | ||
|
||
/**
 * [FileTextExtractor] that tries Tika first and, when Tika yields no text, falls back to
 * Tesseract-based extraction for PDFs and images.
 *
 * The raw text is then stripped of short words and checked by [FileContentValidator].
 */
@Service
class AutoDetectTextExtractor(
    private val tika: Tika,
    private val tessBaseAPI: TessBaseAPI
) : FileTextExtractor {

    // NOTE(review): despite the name, words of exactly this length are dropped as well
    // (the filter below keeps only words strictly longer). Confirm which of the two is intended.
    private val minWordLength = 4

    /**
     * Extracts text from [file], passing [maxLength] on to the underlying extractors.
     *
     * @return the cleaned text, or `null` when nothing could be extracted or the validator rejected it.
     */
    override fun extract(file: File, maxLength: Int): String? {
        val content = cleanSmallWords(extractRaw(file, maxLength))
        return if (FileContentValidator().isValid(content)) content else null
    }

    /**
     * Picks an extraction strategy: Tika when it produces non-blank text, otherwise the PDF or image
     * extractor depending on the MIME type detected by Tika.
     *
     * @return the extracted text, or `null` for other MIME types or when an [IOException] occurs.
     */
    private fun extractRaw(file: File, maxLength: Int): String? {
        try {
            val tikaContent = TikaFileTextExtractor(tika).extract(file, maxLength)
            if (!isScanned(tikaContent)) {
                logger.info("Extracted text from " + file.absolutePath + " using Tika")
                return tikaContent
            }
            val type = tika.detect(file)
            if (type == "application/pdf") {
                logger.info("Extracted text from " + file.absolutePath + " using Bytedeco for PDF")
                return PDFTextExtractor(tessBaseAPI).extract(file, maxLength)
            }
            if (type.startsWith("image")) {
                logger.info("Extracted text from " + file.absolutePath + " using Bytedeco")
                return ImageTextExtractor(tessBaseAPI).extract(file, maxLength)
            }
        } catch (e: IOException) {
            logger.error("File can not be read: " + file.absolutePath, e)
        }
        return null
    }

    /** Treats a document as scanned when Tika returned `null` or only whitespace. */
    private fun isScanned(content: String?): Boolean = content.isNullOrBlank()

    /**
     * Removes every whitespace-separated token that is not longer than [minWordLength].
     *
     * Each kept token is followed by a single space, so a non-empty result ends with a space.
     *
     * @return the filtered text, or `null` when [content] is `null`.
     */
    private fun cleanSmallWords(content: String?): String? {
        if (content == null) {
            return null
        }
        return content.split(WHITESPACE)
            .filter { it.length > minWordLength }
            .joinToString(separator = "") { "$it " }
    }

    companion object {
        // Logger is bound to this class so log lines are attributed to the extractor.
        private val logger = LoggerFactory.getLogger(AutoDetectTextExtractor::class.java)

        // Compiled once instead of on every cleanSmallWords call.
        private val WHITESPACE = "\\s".toRegex()
    }
}
26 changes: 26 additions & 0 deletions
26
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import org.slf4j.LoggerFactory | ||
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService | ||
|
||
/**
 * Decides whether extracted text looks like real content, based on its share of
 * alphanumeric characters.
 */
class FileContentValidator {

    // Minimum fraction of characters that must be letters or digits.
    private val minAlphanumericsPercent = 0.6

    /**
     * Checks [content] against the alphanumeric-ratio threshold.
     *
     * @return `true` when [content] is `null` or empty (there is nothing to reject), or when at
     * least 60% of its characters are letters or digits; `false` otherwise.
     */
    fun isValid(content: String?): Boolean {
        return content.isNullOrEmpty() || whiteSpaceFilter(content)
    }

    /**
     * Returns `false` and logs a warning when fewer than [minAlphanumericsPercent] of the
     * characters in [content] are letters or digits. Callers must pass a non-empty string.
     */
    private fun whiteSpaceFilter(content: String): Boolean {
        // count() avoids building a filtered copy of the text just to measure it.
        val alphanumerics = content.count { it.isLetterOrDigit() }
        // The ratio is computed in Double so it is compared with the Double threshold without Float rounding.
        if (alphanumerics.toDouble() / content.length < minAlphanumericsPercent) {
            logger.warn("Number of alphanumeric chars is less than 60%. OCR result will be turned to null")
            return false
        }
        return true
    }

    companion object {
        // Logger is bound to this class so log lines are attributed to the validator.
        private val logger = LoggerFactory.getLogger(FileContentValidator::class.java)
    }
}
2 changes: 1 addition & 1 deletion
2
.../ksi/mordor/services/FileTextExtractor.kt → ...vices/text/extractor/FileTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
18 changes: 18 additions & 0 deletions
18
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import java.io.File | ||
import org.bytedeco.leptonica.global.lept.pixRead | ||
import org.bytedeco.tesseract.TessBaseAPI | ||
|
||
/**
 * [FileTextExtractor] that runs Tesseract OCR on a single image file.
 *
 * NOTE(review): the [TessBaseAPI] instance is supplied by the caller and mutated by every call
 * (`SetImage`); confirm callers do not invoke [extract] concurrently on a shared instance.
 */
class ImageTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {

    /**
     * Reads [file] with Leptonica and returns the text recognised by Tesseract.
     *
     * @param maxLength maximum number of characters to return; a negative value means no limit.
     * @return the recognised text, or `null` when the image could not be read or Tesseract
     * produced no result.
     */
    override fun extract(file: File, maxLength: Int): String? {
        // pixRead returns null for unreadable or unsupported files; never hand a null image to Tesseract.
        val pix = pixRead(file.absolutePath) ?: return null
        try {
            tessBaseAPI.SetImage(pix)

            val utf8Text = tessBaseAPI.GetUTF8Text() ?: return null
            val res = try {
                // The buffer is UTF-8 by contract; decode explicitly instead of using the platform charset.
                utf8Text.getString("UTF-8").trimIndent()
            } finally {
                // The text buffer is natively allocated and must be released by the caller.
                utf8Text.deallocate()
            }
            return if (maxLength < 0) res else res.take(maxLength)
        } finally {
            // Release the native image; fully qualified so no additional import is required.
            org.bytedeco.leptonica.global.lept.pixDestroy(pix)
        }
    }
}
82 changes: 82 additions & 0 deletions
82
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import com.recognition.software.jdeskew.ImageDeskew | ||
import java.awt.image.BufferedImage | ||
import java.io.File | ||
import java.io.IOException | ||
import java.util.LinkedList | ||
import javax.imageio.ImageIO | ||
import net.sourceforge.tess4j.util.ImageHelper | ||
import org.apache.pdfbox.pdmodel.PDDocument | ||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject | ||
import org.bytedeco.tesseract.TessBaseAPI | ||
import org.slf4j.LoggerFactory | ||
import org.springframework.stereotype.Service | ||
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService | ||
|
||
/**
 * [FileTextExtractor] for scanned PDFs: pulls the embedded images out of the document,
 * deskews them and runs each one through [ImageTextExtractor].
 */
@Service
class PDFTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {

    // Skew angles whose absolute value is at or below this threshold are left uncorrected.
    private val maxSkewAngle = 0.05

    /**
     * OCRs the images embedded in [file] and concatenates the recognised text.
     *
     * @param maxLength maximum number of characters to return; a negative value means no limit.
     * @return the recognised text, or `null` when the PDF contains no embedded images.
     * @throws IOException if the PDF itself cannot be loaded or the temporary file cannot be created.
     */
    override fun extract(file: File, maxLength: Int): String? {
        val bufferedImages = formatPDF(file)
        if (bufferedImages.isEmpty()) {
            return null
        }

        val extracted = StringBuilder()
        val outputFile = File.createTempFile("temp", ".jpg")
        try {
            for (image in bufferedImages) {
                // Stop as soon as the limit is reached; the result is truncated to maxLength below anyway.
                if (maxLength >= 0 && extracted.length >= maxLength) {
                    break
                }

                try {
                    // ImageIO.write returns false when no suitable writer exists; the temp file would
                    // then still hold the previous page, so skip OCR for this image.
                    if (!ImageIO.write(correctTwisted(image), "jpg", outputFile)) {
                        logger.warn("Could not encode an image from " + file.absolutePath + " as jpg")
                        continue
                    }
                    val text: String? = ImageTextExtractor(tessBaseAPI).extract(outputFile, maxLength)
                    if (text != null) {
                        extracted.append(text)
                    }
                } catch (e: IOException) {
                    logger.error("Could not retrieve text from " + file.absolutePath, e)
                }
            }
        } finally {
            // Always remove the temp file, even when an unexpected exception escapes the loop.
            outputFile.delete()
        }

        return if (maxLength >= 0 && maxLength < extracted.length) {
            extracted.substring(0, maxLength)
        } else {
            extracted.toString()
        }
    }

    /**
     * Collects every image XObject referenced from the resources of each page of [pdfFile].
     * The document is closed before returning.
     */
    private fun formatPDF(pdfFile: File): LinkedList<BufferedImage?> {
        val bufferedImages = LinkedList<BufferedImage?>()
        PDDocument.load(pdfFile).use { doc ->
            for (page in doc.pages) {
                // A page may have no resource dictionary at all.
                val resources = page.resources ?: continue
                for (xObjectName in resources.xObjectNames) {
                    val xObject = resources.getXObject(xObjectName)
                    if (xObject is PDImageXObject) {
                        bufferedImages.add(xObject.image)
                    }
                }
            }
        }
        return bufferedImages
    }

    /**
     * Converts [image] to grayscale and, when its detected skew exceeds [maxSkewAngle],
     * rotates it by the opposite angle.
     */
    private fun correctTwisted(image: BufferedImage?): BufferedImage? {
        val imageSkewAngle = ImageDeskew(image).skewAngle
        val grayscale = ImageHelper.convertImageToGrayscale(image)
        return if (kotlin.math.abs(imageSkewAngle) > maxSkewAngle) {
            ImageHelper.rotateImage(grayscale, -imageSkewAngle)
        } else {
            grayscale
        }
    }

    companion object {
        // Logger is bound to this class so log lines are attributed to the PDF extractor.
        private val logger = LoggerFactory.getLogger(PDFTextExtractor::class.java)
    }
}
2 changes: 1 addition & 1 deletion
2
.../mordor/services/TikaFileTextExtractor.kt → ...s/text/extractor/TikaFileTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.