diff --git a/build.gradle b/build.gradle
index ddfe7f2..114c2cd 100644
--- a/build.gradle
+++ b/build.gradle
@@ -70,6 +70,9 @@ dependencies {
     compile("org.glassfish.jaxb:jaxb-runtime:2.3.2")
     compile("commons-io:commons-io:2.6")
     compile("org.apache.tika:tika-parsers:1.24")
+    compile("org.apache.pdfbox:pdfbox:2.0.19")
+    compile("net.sourceforge.tess4j:tess4j:4.5.1")
+    compile("org.bytedeco:tesseract-platform:4.1.1-1.5.3")
 
     // Remove devtools for release
     runtime('org.springframework.boot:spring-boot-devtools')
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index e0b3fb8..c9d3a09 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
+#Mon May 04 21:03:43 CEST 2020
+distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip
-zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt
new file mode 100644
index 0000000..136f5fd
--- /dev/null
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt
@@ -0,0 +1,30 @@
+package pl.edu.uj.ii.ksi.mordor.configuration
+
+import org.bytedeco.tesseract.TessBaseAPI
+import org.springframework.context.annotation.Bean
+import org.springframework.context.annotation.Configuration
+
+/** Spring configuration that exposes the Tesseract engine used for OCR as a bean. */
+@Configuration
+class TessBaseConfig {
+    /**
+     * Creates a Tesseract engine with the English and Polish models loaded together.
+     *
+     * Tesseract accepts several languages as one "+"-separated list. A second Init call
+     * re-initialises the engine, so initialising "eng" and then "pol" left only Polish active.
+     *
+     * NOTE(review): the tessdata path is relative to the working directory and points into the
+     * source tree; presumably it will not resolve from a packaged jar - confirm.
+     *
+     * @throws IllegalStateException if Tesseract reports that initialisation failed
+     */
+    @Bean
+    fun tessBaseAPI(): TessBaseAPI {
+        val api = TessBaseAPI()
+        // Init returns 0 on success; anything else means the models could not be loaded.
+        check(api.Init("./src/main/resources/tessdata/", "eng+pol") == 0) {
+            "Could not initialise Tesseract (eng+pol) from ./src/main/resources/tessdata/"
+        }
+        return api
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileEntryCreator.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileEntryCreator.kt
index b45e50c..4477e0e
100644
--- a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileEntryCreator.kt
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileEntryCreator.kt
@@ -2,6 +2,7 @@ package pl.edu.uj.ii.ksi.mordor.services
 
 import java.io.File
 import javax.persistence.EntityManager
+import org.springframework.beans.factory.annotation.Qualifier
 import org.springframework.stereotype.Service
 import org.springframework.transaction.annotation.Transactional
 import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileContent
@@ -9,12 +10,13 @@ import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileEntry
 import pl.edu.uj.ii.ksi.mordor.persistence.entities.FileMetadata
 import pl.edu.uj.ii.ksi.mordor.persistence.repositories.FileMetadataRepository
 import pl.edu.uj.ii.ksi.mordor.services.hash.FileHashProvider
+import pl.edu.uj.ii.ksi.mordor.services.text.extractor.FileTextExtractor
 
 @Service
 class FileEntryCreator(
     private val metadataExtractor: MetadataExtractor,
     private val entityManager: EntityManager,
-    private val fileTextExtractor: FileTextExtractor,
+    @Qualifier("autoDetectTextExtractor") private val fileTextExtractor: FileTextExtractor,
     private val hashProvider: FileHashProvider,
     private val metadataRepository: FileMetadataRepository
 ) {
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt
new file mode 100644
index 0000000..77d8f76
--- /dev/null
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt
@@ -0,0 +1,81 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import java.io.File
+import java.io.IOException
+import org.apache.tika.Tika
+import org.bytedeco.tesseract.TessBaseAPI
+import org.slf4j.LoggerFactory
+import org.springframework.stereotype.Service
+
+/**
+ * Extracts text with Tika first and falls back to Tesseract OCR for PDFs and images when Tika
+ * yields no text.
+ */
+@Service
+class AutoDetectTextExtractor(
+    private val tika: Tika,
+    private val tessBaseAPI: TessBaseAPI
+) : FileTextExtractor {
+
+    companion object {
+        private val logger = LoggerFactory.getLogger(AutoDetectTextExtractor::class.java)
+
+        // Compiled once instead of on every cleanSmallWords call.
+        private val whitespace = "\\s".toRegex()
+    }
+
+    private val minWordLength = 4
+
+    /**
+     * Extracts the text of [file], drops short words and discards results that fail validation.
+     *
+     * @param file file to read
+     * @param maxLength length limit handed to the underlying extractors
+     * @return the cleaned text, or null when nothing was extracted or [FileContentValidator]
+     * rejected it
+     */
+    override fun extract(file: File, maxLength: Int): String? {
+        val content = cleanSmallWords(extractRaw(file, maxLength))
+        return if (FileContentValidator().isValid(content)) content else null
+    }
+
+    /**
+     * Returns Tika's text when it is non-blank; otherwise runs OCR for PDFs and images.
+     * Returns null for other types and when an [IOException] is raised while reading.
+     */
+    private fun extractRaw(file: File, maxLength: Int): String? {
+        try {
+            val tikaContent = TikaFileTextExtractor(tika).extract(file, maxLength)
+            if (!isScanned(tikaContent)) {
+                logger.info("Extracted text from ${file.absolutePath} using Tika")
+                return tikaContent
+            }
+            val type = tika.detect(file)
+            if (type == "application/pdf") {
+                logger.info("Extracted text from ${file.absolutePath} using Bytedeco for PDF")
+                return PDFTextExtractor(tessBaseAPI).extract(file, maxLength)
+            }
+            if (type.startsWith("image")) {
+                logger.info("Extracted text from ${file.absolutePath} using Bytedeco")
+                return ImageTextExtractor(tessBaseAPI).extract(file, maxLength)
+            }
+        } catch (e: IOException) {
+            logger.error("File can not be read: ${file.absolutePath}", e)
+        }
+        return null
+    }
+
+    /** Treats null or blank extractor output as a sign that the file holds no text layer. */
+    private fun isScanned(content: String?): Boolean = content.isNullOrBlank()
+
+    /**
+     * Keeps only whitespace-separated words of at least [minWordLength] characters, each followed
+     * by a single space. The comparison is inclusive so that words of exactly the minimum length
+     * are kept.
+     */
+    private fun cleanSmallWords(content: String?): String? =
+        content
+            ?.split(whitespace)
+            ?.filter { it.length >= minWordLength }
+            ?.joinToString(separator = "") { "$it " }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt
new file mode 100644
index 0000000..fea1e6f
--- /dev/null
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt
@@ -0,0 +1,26 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import org.slf4j.LoggerFactory
+import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService
+
+class FileContentValidator {
+
+    private val minAlphanumericsPercent = 0.6
+
+    companion object {
+        private val logger = LoggerFactory.getLogger(RepositoryService::class.java)
+    }
+
+    fun isValid(content: String?): Boolean {
+        return content.isNullOrEmpty() || whiteSpaceFilter(content)
+    }
+
+    private fun whiteSpaceFilter(content: String): Boolean {
+        val letters = content.filter { c -> c.isLetterOrDigit() }.length
+        if (letters.toFloat().div(content.length) < minAlphanumericsPercent) {
+            logger.warn("Number of alphanumeric chars is less than 60%. 
OCR result will be turned to null")
+            return false
+        }
+        return true
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileTextExtractor.kt
similarity index 66%
rename from src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileTextExtractor.kt
rename to src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileTextExtractor.kt
index 2d193a9..a3b554d 100644
--- a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/FileTextExtractor.kt
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileTextExtractor.kt
@@ -1,4 +1,4 @@
-package pl.edu.uj.ii.ksi.mordor.services
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
 
 import java.io.File
 
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt
new file mode 100644
index 0000000..0ceb9cc
--- /dev/null
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt
@@ -0,0 +1,36 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import java.io.File
+import org.bytedeco.leptonica.global.lept.pixDestroy
+import org.bytedeco.leptonica.global.lept.pixRead
+import org.bytedeco.tesseract.TessBaseAPI
+
+/** Runs Tesseract OCR on a single image file. */
+class ImageTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {
+
+    /**
+     * Recognises the text in the image stored at [file].
+     *
+     * @param file image file to read
+     * @param maxLength maximum number of characters to return; a negative value means no limit
+     * @return the recognised text, or null when the image could not be read or no text came back
+     */
+    override fun extract(file: File, maxLength: Int): String? {
+        // pixRead yields null when the file cannot be decoded.
+        val image = pixRead(file.absolutePath) ?: return null
+        try {
+            tessBaseAPI.SetImage(image)
+            val recognized = tessBaseAPI.GetUTF8Text() ?: return null
+            try {
+                val text = recognized.string.trimIndent()
+                return if (maxLength < 0) text else text.take(maxLength)
+            } finally {
+                // Native buffers returned by Tesseract have to be released explicitly.
+                recognized.deallocate()
+            }
+        } finally {
+            // The decoded image is native memory as well.
+            pixDestroy(image)
+        }
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt
new file mode 100644
index 0000000..ea4c1f6
--- /dev/null
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt
@@ -0,0 +1,106 @@
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
+
+import com.recognition.software.jdeskew.ImageDeskew
+import java.awt.image.BufferedImage
+import java.io.File
+import java.io.IOException
+import javax.imageio.ImageIO
+import net.sourceforge.tess4j.util.ImageHelper
+import org.apache.pdfbox.pdmodel.PDDocument
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject
+import org.bytedeco.tesseract.TessBaseAPI
+import org.slf4j.LoggerFactory
+import org.springframework.stereotype.Service
+
+/** Runs OCR on the images embedded in a PDF document. */
+@Service
+class PDFTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {
+
+    private val maxSkewAngle = 0.05
+
+    companion object {
+        private val logger = LoggerFactory.getLogger(PDFTextExtractor::class.java)
+    }
+
+    /**
+     * Recognises the text of every image embedded in [file] and concatenates the results.
+     *
+     * @param file PDF document to read
+     * @param maxLength maximum number of characters to return; a negative value means no limit
+     * @return the recognised text, or null when the document contains no images
+     * @throws IOException when the document cannot be loaded or the temporary file cannot be created
+     */
+    override fun extract(file: File, maxLength: Int): String? {
+        val bufferedImages = formatPDF(file)
+        if (bufferedImages.isEmpty()) {
+            return null
+        }
+
+        val extracted = StringBuilder()
+        // The suffix needs its own dot, otherwise the file is created without an extension.
+        val outputFile = File.createTempFile("temp", ".jpg")
+        try {
+            for (image in bufferedImages) {
+                if (maxLength >= 0 && extracted.length > maxLength) {
+                    break
+                }
+                extracted.append(recognize(image, outputFile, file, maxLength))
+            }
+        } finally {
+            // Remove the scratch file even when OCR fails with an unexpected exception.
+            outputFile.delete()
+        }
+
+        return if (maxLength >= 0) extracted.toString().take(maxLength) else extracted.toString()
+    }
+
+    /**
+     * Writes [image] to [scratch] and runs OCR on it; returns an empty string when the image could
+     * not be written or read, so one bad image does not abort the whole document.
+     */
+    private fun recognize(image: BufferedImage?, scratch: File, source: File, maxLength: Int): String {
+        try {
+            // ImageIO.write returns false when no writer accepts the image; the scratch file
+            // would then still hold the previous image, so OCR must be skipped.
+            if (ImageIO.write(correctTwisted(image), "jpg", scratch)) {
+                return ImageTextExtractor(tessBaseAPI).extract(scratch, maxLength).orEmpty()
+            }
+            logger.warn("Could not encode an image from " + source.absolutePath + " as JPEG")
+        } catch (e: IOException) {
+            logger.error("Could not retrieve text from " + source.absolutePath, e)
+        }
+        return ""
+    }
+
+    /** Collects the image XObjects directly referenced by the page resources of [pdfFile]. */
+    private fun formatPDF(pdfFile: File): List<BufferedImage> {
+        val bufferedImages = mutableListOf<BufferedImage>()
+        PDDocument.load(pdfFile).use { doc ->
+            for (page in doc.pages) {
+                // A page without a resource dictionary has no images to offer.
+                val resources = page.resources ?: continue
+                for (xObjectName in resources.xObjectNames) {
+                    val xObject = resources.getXObject(xObjectName)
+                    if (xObject is PDImageXObject) {
+                        bufferedImages.add(xObject.image)
+                    }
+                }
+            }
+        }
+        return bufferedImages
+    }
+
+    /**
+     * Converts [image] to grayscale and, when its detected skew exceeds [maxSkewAngle], rotates it
+     * back by that angle.
+     */
+    private fun correctTwisted(image: BufferedImage?): BufferedImage? {
+        val imageSkewAngle = ImageDeskew(image).skewAngle
+        val grayscale = ImageHelper.convertImageToGrayscale(image)
+        return if (kotlin.math.abs(imageSkewAngle) > maxSkewAngle) {
+            ImageHelper.rotateImage(grayscale, -imageSkewAngle)
+        } else {
+            grayscale
+        }
+    }
+}
diff --git a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/TikaFileTextExtractor.kt b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/TikaFileTextExtractor.kt
similarity index 94%
rename from src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/TikaFileTextExtractor.kt
rename to src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/TikaFileTextExtractor.kt
index 7d74cd6..19810fa 100644
--- a/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/TikaFileTextExtractor.kt
+++ b/src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/TikaFileTextExtractor.kt
@@ -1,4 +1,4 @@
-package pl.edu.uj.ii.ksi.mordor.services
+package pl.edu.uj.ii.ksi.mordor.services.text.extractor
 
 import java.io.File
 import java.io.IOException
diff --git a/src/main/resources/tessdata/eng.traineddata
b/src/main/resources/tessdata/eng.traineddata new file mode 100644 index 0000000..f4744c2 Binary files /dev/null and b/src/main/resources/tessdata/eng.traineddata differ diff --git a/src/main/resources/tessdata/pol.traineddata b/src/main/resources/tessdata/pol.traineddata new file mode 100644 index 0000000..fba7958 Binary files /dev/null and b/src/main/resources/tessdata/pol.traineddata differ