OCR (#65)

* Add AutoDetectTextExtractor * Add bytedeco ImageTextExtractor * Add PDF text extractor * Add max length validation * Add white chars filter * Add minimum word length validator
KSIUJ · May 14, 2020 · 4355720 · 4355720
1 parent fd34656
commit 4355720
Show file tree

Hide file tree

Showing 12 changed files with 219 additions and 5 deletions.
diff --git a/build.gradle b/build.gradle
@@ -70,6 +70,9 @@ dependencies {
     compile("org.glassfish.jaxb:jaxb-runtime:2.3.2")
     compile("commons-io:commons-io:2.6")
     compile("org.apache.tika:tika-parsers:1.24")
+    compile("org.apache.pdfbox:pdfbox:2.0.19")
+    compile("net.sourceforge.tess4j:tess4j:4.5.1")
+    compile("org.bytedeco:tesseract-platform:4.1.1-1.5.3")
 
     // Remove devtools for release
     runtime('org.springframework.boot:spring-boot-devtools')

diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
+#Mon May 04 21:03:43 CEST 2020
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip
-zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt
@@ -0,0 +1,16 @@
+package pl.edu.uj.ii.ksi.mordor.configuration
+
+import org.bytedeco.tesseract.TessBaseAPI
+import org.springframework.context.annotation.Bean
+import org.springframework.context.annotation.Configuration
+
+/**
+ * Spring configuration that exposes the Tesseract OCR engine as a bean.
+ */
+@Configuration
+class TessBaseConfig {
+    /**
+     * Creates a [TessBaseAPI] initialised with both the English and the Polish model.
+     *
+     * Calling `Init` once per language re-initialises the engine each time, which leaves only the
+     * last language active. Tesseract loads several languages at once from a single
+     * `+`-separated language string, so one call with `eng+pol` is used instead.
+     *
+     * NOTE(review): the tessdata path is relative to the process working directory - confirm it
+     * resolves correctly outside of a development checkout.
+     *
+     * @return the initialised OCR engine.
+     * @throws IllegalStateException if Tesseract reports an initialisation failure.
+     */
+    @Bean
+    fun tessBaseAPI(): TessBaseAPI {
+        val api = TessBaseAPI()
+        // Init returns 0 on success; fail fast rather than hand out an unusable engine.
+        check(api.Init("./src/main/resources/tessdata/", "eng+pol") == 0) {
+            "Could not initialize Tesseract from ./src/main/resources/tessdata/"
+        }
+        return api
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileEntryCreator.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileEntryCreator.kt
@@ -2,19 +2,21 @@ package pl.edu.uj.ii.ksi.mordor.services
 
 import java.io.File
 import javax.persistence.EntityManager
+import org.springframework.beans.factory.annotation.Qualifier
 import org.springframework.stereotype.Service
 import org.springframework.transaction.annotation.Transactional
 import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileContent
 import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileEntry
 import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileMetadata
 import pl.edu.uj.ii.ksi.mordor.persistence.repositories.FileMetadataRepository
 import pl.edu.uj.ii.ksi.mordor.services.hash.FileHashProvider
+import pl.edu.uj.ii.ksi.mordor.services.text.extractor.FileTextExtractor
 
 @Service
 class FileEntryCreator(
     private val metadataExtractor: MetadataExtractor,
     private val entityManager: EntityManager,
-    private val fileTextExtractor: FileTextExtractor,
+    @Qualifier("autoDetectTextExtractor") private val fileTextExtractor: FileTextExtractor,
     private val hashProvider: FileHashProvider,
     private val metadataRepository: FileMetadataRepository
 ) {

diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt
@@ -0,0 +1,67 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import java.io.File
+import java.io.IOException
+import java.lang.StringBuilder
+import org.apache.tika.Tika
+import org.bytedeco.tesseract.TessBaseAPI
+import org.slf4j.LoggerFactory
+import org.springframework.stereotype.Service
+import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService
+
+/**
+ * [FileTextExtractor] that first asks Tika for the text of a file and, when Tika yields nothing
+ * but whitespace, falls back to OCR for PDFs and images.
+ *
+ * The raw text is then stripped of short tokens and checked by [FileContentValidator].
+ */
+@Service
+class AutoDetectTextExtractor(
+    private val tika: Tika,
+    private val tessBaseAPI: TessBaseAPI
+) : FileTextExtractor {
+
+    companion object {
+        // Log under this class' own category so messages are attributed to the right component.
+        private val logger = LoggerFactory.getLogger(AutoDetectTextExtractor::class.java)
+
+        // Compiled once instead of on every cleanSmallWords call.
+        private val whitespace = "\\s".toRegex()
+    }
+
+    // Tokens must be strictly longer than this to survive cleanSmallWords.
+    private val minWordLength = 4
+
+    /**
+     * Extracts, cleans and validates the text of [file].
+     *
+     * @param file file to read.
+     * @param maxLength length limit handed to the underlying extractors.
+     * @return the cleaned text, or `null` when nothing could be extracted or validation failed.
+     */
+    override fun extract(file: File, maxLength: Int): String? {
+        val content = cleanSmallWords(extractRaw(file, maxLength))
+        return if (FileContentValidator().isValid(content)) content else null
+    }
+
+    /**
+     * Returns Tika's text when it is not blank; otherwise runs OCR depending on the detected
+     * media type. Returns `null` for other types or when an [IOException] occurs.
+     */
+    private fun extractRaw(file: File, maxLength: Int): String? {
+        try {
+            val tikaContent = TikaFileTextExtractor(tika).extract(file, maxLength)
+            if (!isScanned(tikaContent)) {
+                logger.info("Extracted text from ${file.absolutePath} using Tika")
+                return tikaContent
+            }
+            val type = tika.detect(file)
+            if (type == "application/pdf") {
+                logger.info("Extracted text from ${file.absolutePath} using Bytedeco for PDF")
+                return PDFTextExtractor(tessBaseAPI).extract(file, maxLength)
+            }
+            if (type.startsWith("image")) {
+                logger.info("Extracted text from ${file.absolutePath} using Bytedeco")
+                return ImageTextExtractor(tessBaseAPI).extract(file, maxLength)
+            }
+        } catch (e: IOException) {
+            logger.error("File can not be read: ${file.absolutePath}", e)
+        }
+        return null
+    }
+
+    /** A file is treated as scanned when the text layer is missing or only whitespace. */
+    private fun isScanned(content: String?): Boolean = content.isNullOrBlank()
+
+    /**
+     * Splits [content] on single whitespace characters and keeps only tokens longer than
+     * [minWordLength], each followed by one space. Returns `null` for `null` input.
+     *
+     * NOTE(review): tokens of exactly [minWordLength] characters are dropped as well - confirm
+     * whether `>=` was intended.
+     */
+    private fun cleanSmallWords(content: String?): String? {
+        if (content == null) {
+            return null
+        }
+        return content.split(whitespace)
+            .filter { it.length > minWordLength }
+            .joinToString(separator = "") { "$it " }
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt
@@ -0,0 +1,26 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import org.slf4j.LoggerFactory
+import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService
+
+/**
+ * Decides whether extracted text looks like real content by checking the share of
+ * alphanumeric characters in it.
+ */
+class FileContentValidator {
+
+    // Minimum fraction of letter-or-digit characters required for non-empty content.
+    private val minAlphanumericsPercent = 0.6
+
+    companion object {
+        // Log under this class' own category so warnings are attributed to the validator.
+        private val logger = LoggerFactory.getLogger(FileContentValidator::class.java)
+    }
+
+    /**
+     * Returns `true` for `null` or empty [content], and otherwise only when at least
+     * [minAlphanumericsPercent] of its characters are letters or digits.
+     */
+    fun isValid(content: String?): Boolean {
+        return content.isNullOrEmpty() || whiteSpaceFilter(content)
+    }
+
+    /**
+     * Checks the alphanumeric ratio of a non-empty [content] and logs a warning when it is too low.
+     */
+    private fun whiteSpaceFilter(content: String): Boolean {
+        val alphanumerics = content.count { it.isLetterOrDigit() }
+        // Compute the ratio in Double so it is compared with the Double threshold at full precision.
+        val ratio = alphanumerics.toDouble() / content.length
+        if (ratio < minAlphanumericsPercent) {
+            logger.warn("Number of alphanumeric chars is less than 60%. OCR result will be turned to null")
+            return false
+        }
+        return true
+    }
+}
diff --git a/.../ksi/mordor/services/FileTextExtractor.kt → ...vices/text/extractor/FileTextExtractor.kt b/.../ksi/mordor/services/FileTextExtractor.kt → ...vices/text/extractor/FileTextExtractor.kt
@@ -1,4 +1,4 @@
-package pl.edu.uj.ii.ksi.mordor.services
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
 
 import java.io.File
 

diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt
@@ -0,0 +1,18 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import java.io.File
+import org.bytedeco.leptonica.global.lept.pixRead
+import org.bytedeco.tesseract.TessBaseAPI
+
+/**
+ * [FileTextExtractor] that runs Tesseract OCR on a single image file.
+ */
+class ImageTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {
+
+    /**
+     * Recognises the text in [file].
+     *
+     * @param file image file to read.
+     * @param maxLength maximum number of characters to return; a negative value means no limit.
+     * @return the recognised text, or `null` when the image cannot be loaded or Tesseract
+     * returns no result.
+     */
+    override fun extract(file: File, maxLength: Int): String? {
+        // pixRead yields null when the file cannot be opened or decoded.
+        val image = pixRead(file.absolutePath) ?: return null
+        try {
+            tessBaseAPI.SetImage(image)
+            val utf8Text = tessBaseAPI.GetUTF8Text() ?: return null
+            val res = try {
+                // The buffer is UTF-8 by contract; decode explicitly instead of relying on the
+                // platform default charset.
+                utf8Text.getString("UTF-8").trimIndent()
+            } finally {
+                // The native text buffer is owned by the caller and must be released.
+                utf8Text.deallocate()
+            }
+            return if (maxLength < 0) res else res.take(maxLength)
+        } finally {
+            // Release the native image; fully qualified so no extra import is required.
+            org.bytedeco.leptonica.global.lept.pixDestroy(image)
+        }
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt
@@ -0,0 +1,82 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import com.recognition.software.jdeskew.ImageDeskew
+import java.awt.image.BufferedImage
+import java.io.File
+import java.io.IOException
+import java.util.LinkedList
+import javax.imageio.ImageIO
+import net.sourceforge.tess4j.util.ImageHelper
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
+import org.bytedeco.tesseract.TessBaseAPI
+import org.slf4j.LoggerFactory
+import org.springframework.stereotype.Service
+import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService
+
+/**
+ * [FileTextExtractor] for scanned PDFs: pulls the images embedded in the pages, deskews them and
+ * runs each one through [ImageTextExtractor].
+ */
+@Service
+class PDFTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {
+
+    // Skew angles with an absolute value above this are corrected by rotating the image.
+    private val maxSkewAngle = 0.05
+
+    companion object {
+        // Log under this class' own category so errors are attributed to the PDF extractor.
+        private val logger = LoggerFactory.getLogger(PDFTextExtractor::class.java)
+    }
+
+    /**
+     * Runs OCR over every image embedded in [file].
+     *
+     * @param file PDF document to read.
+     * @param maxLength maximum number of characters to return; a negative value means no limit.
+     * @return the recognised text of all images joined by newlines, or `null` when the document
+     * contains no images.
+     */
+    override fun extract(file: File, maxLength: Int): String? {
+        val bufferedImages = formatPDF(file)
+        if (bufferedImages.isEmpty()) {
+            return null
+        }
+
+        val extracted = StringBuilder()
+        val outputFile = File.createTempFile("temp", ".jpg")
+        try {
+            for (image in bufferedImages) {
+                if (maxLength >= 0 && extracted.length > maxLength) {
+                    break
+                }
+
+                try {
+                    ImageIO.write(correctTwisted(image), "jpg", outputFile)
+                    val text = ImageTextExtractor(tessBaseAPI).extract(outputFile, maxLength)
+                    if (!text.isNullOrEmpty()) {
+                        // Separate images so the last word of one is not fused with the first
+                        // word of the next.
+                        if (extracted.isNotEmpty()) {
+                            extracted.append('\n')
+                        }
+                        extracted.append(text)
+                    }
+                } catch (e: IOException) {
+                    logger.error("Could not retrieve text from " + file.absolutePath, e)
+                }
+            }
+        } finally {
+            // Remove the scratch file even when OCR fails with an unexpected exception.
+            outputFile.delete()
+        }
+
+        return if (maxLength >= 0 && maxLength < extracted.length) {
+            extracted.substring(0, maxLength)
+        } else {
+            extracted.toString()
+        }
+    }
+
+    /**
+     * Collects the images that are direct XObjects of each page's resources, in page order.
+     */
+    private fun formatPDF(pdfFile: File): LinkedList<BufferedImage?> {
+        val bufferedImages = LinkedList<BufferedImage?>()
+        PDDocument.load(pdfFile).use { doc ->
+            for (page in doc.pages) {
+                val resources = page.resources
+                for (xObjectName in resources.xObjectNames) {
+                    val xObject = resources.getXObject(xObjectName)
+                    if (xObject is PDImageXObject) {
+                        bufferedImages.add(xObject.image)
+                    }
+                }
+            }
+        }
+        return bufferedImages
+    }
+
+    /**
+     * Converts [image] to grayscale and, when its skew exceeds [maxSkewAngle], rotates it back
+     * by the detected angle.
+     */
+    private fun correctTwisted(image: BufferedImage?): BufferedImage? {
+        val imageSkewAngle = ImageDeskew(image).skewAngle
+        val grayscale = ImageHelper.convertImageToGrayscale(image)
+        return if (kotlin.math.abs(imageSkewAngle) > maxSkewAngle) {
+            ImageHelper.rotateImage(grayscale, -imageSkewAngle)
+        } else {
+            grayscale
+        }
+    }
+}
diff --git a/.../mordor/services/TikaFileTextExtractor.kt → ...s/text/extractor/TikaFileTextExtractor.kt b/.../mordor/services/TikaFileTextExtractor.kt → ...s/text/extractor/TikaFileTextExtractor.kt
@@ -1,4 +1,4 @@
-package pl.edu.uj.ii.ksi.mordor.services
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
 
 import java.io.File
 import java.io.IOException

diff --git a/src/main/resources/tessdata/eng.traineddata b/src/main/resources/tessdata/eng.traineddata
diff --git a/src/main/resources/tessdata/pol.traineddata b/src/main/resources/tessdata/pol.traineddata