-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
OCR #65
Merged
Merged
OCR #65
Changes from 13 commits
Commits
Show all changes
16 commits
Select commit
Hold shift + click to select a range
515e2a3
Add AutoDetectTextExtractor
yuliiabuchko e992e67
Reformat TextExtractor based on detekt errors
DominikWolek 3595d69
Fix review issues
yuliiabuchko 20e4064
Add bytedeco ImageTextExtractor
yuliiabuchko 0231468
Move text extractors to new package
yuliiabuchko 5ab69e6
Add PDF text extractor
yuliiabuchko 7cc1040
Fix temp file creation
yuliiabuchko 412d63d
Add max length validation
yuliiabuchko 38ed7b8
Add white chars filter
yuliiabuchko 6b1d48f
Add white chars filter
yuliiabuchko 93661b5
Merge remote-tracking branch 'origin/ocr_new' into ocr_new
yuliiabuchko 3d29716
Merge branch 'ms-1' into ocr_new
yuliiabuchko 2772416
Rename extractors, update validator
yuliiabuchko d99b486
Add minimum word length validator
yuliiabuchko b5f2d58
Add split using regex, review fixes
yuliiabuchko 2c83681
Merge branch 'ms-1' into ocr_new
yuliiabuchko File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
#Mon May 04 21:03:43 CEST 2020 | ||
distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-all.zip | ||
distributionBase=GRADLE_USER_HOME | ||
distributionPath=wrapper/dists | ||
distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip | ||
zipStoreBase=GRADLE_USER_HOME | ||
zipStorePath=wrapper/dists |
16 changes: 16 additions & 0 deletions
16
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/configuration/TessBaseConfig.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package pl.edu.uj.ii.ksi.mordor.configuration | ||
|
||
import org.bytedeco.tesseract.TessBaseAPI | ||
import org.springframework.context.annotation.Bean | ||
import org.springframework.context.annotation.Configuration | ||
|
||
/**
 * Spring configuration that exposes a single, initialised Tesseract engine as a bean.
 */
@Configuration
class TessBaseConfig {
    /**
     * Creates the [TessBaseAPI] bean with English and Polish language data loaded together.
     *
     * Tesseract accepts several languages in one `Init` call when they are joined with `+`.
     * Calling `Init` a second time re-initialises the engine, so two consecutive calls
     * ("eng", then "pol") would leave only Polish active.
     *
     * @return an engine ready for `SetImage` / `GetUTF8Text` calls.
     * @throws IllegalStateException if Tesseract reports that initialisation failed
     * (for example because the trained data files are missing).
     */
    @Bean
    fun tessBaseAPI(): TessBaseAPI {
        // NOTE(review): this path is relative to the working directory and points into the source
        // tree, so it presumably only resolves when the app is started from the project root — confirm.
        val tessdataPath = "./src/main/resources/tessdata/"
        val languages = "eng+pol"

        val api = TessBaseAPI()
        // Init returns 0 on success; fail fast instead of handing out an unusable engine.
        val status = api.Init(tessdataPath, languages)
        check(status == 0) {
            "Could not initialise Tesseract for languages '$languages' using data in '$tessdataPath'"
        }
        return api
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/AutoDetectTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import java.io.File | ||
import java.io.IOException | ||
import org.apache.tika.Tika | ||
import org.bytedeco.tesseract.TessBaseAPI | ||
import org.slf4j.LoggerFactory | ||
import org.springframework.stereotype.Service | ||
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService | ||
|
||
/**
 * Extracts text from a file, trying Tika first and falling back to OCR when Tika yields
 * no usable text (null or whitespace only).
 *
 * The OCR fallback is chosen from the MIME type detected by Tika: PDFs go through
 * [PDFTextExtractor], images through [ImageTextExtractor]. Any other type yields `null`.
 */
@Service
class AutoDetectTextExtractor(
    private val tika: Tika,
    private val tessBaseAPI: TessBaseAPI
) : FileTextExtractor {

    companion object {
        // Log under this class so messages are attributed to the extractor, not to RepositoryService.
        private val logger = LoggerFactory.getLogger(AutoDetectTextExtractor::class.java)
    }

    /**
     * Returns the extracted text, or `null` when nothing could be extracted or the
     * text was rejected by [FileContentValidator].
     *
     * @param file the file to read.
     * @param maxLength passed through unchanged to the underlying extractors.
     */
    override fun extract(file: File, maxLength: Int): String? =
        extractRaw(file, maxLength)?.takeIf { FileContentValidator().isValid(it) }

    /**
     * Runs Tika and, if its output is blank, the OCR extractor matching the detected type.
     * An [IOException] raised while reading the file is logged and turned into `null`.
     */
    private fun extractRaw(file: File, maxLength: Int): String? {
        try {
            val tikaContent = TikaFileTextExtractor(tika).extract(file, maxLength)
            if (!isScanned(tikaContent)) {
                logger.info("Extracted text from ${file.absolutePath} using Tika")
                return tikaContent
            }

            val type = tika.detect(file)
            return when {
                type == "application/pdf" -> {
                    // Logged before the OCR call, hence "Extracting" rather than "Extracted".
                    logger.info("Extracting text from ${file.absolutePath} using Bytedeco for PDF")
                    PDFTextExtractor(tessBaseAPI).extract(file, maxLength)
                }
                type.startsWith("image/") -> {
                    logger.info("Extracting text from ${file.absolutePath} using Bytedeco")
                    ImageTextExtractor(tessBaseAPI).extract(file, maxLength)
                }
                else -> {
                    logger.debug("No OCR fallback for ${file.absolutePath} of type $type")
                    null
                }
            }
        } catch (e: IOException) {
            logger.error("File can not be read: ${file.absolutePath}", e)
            return null
        }
    }

    /** Treats missing or whitespace-only Tika output as a sign that the file needs OCR. */
    private fun isScanned(content: String?): Boolean = content.isNullOrBlank()
}
29 changes: 29 additions & 0 deletions
29
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/FileContentValidator.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import org.slf4j.LoggerFactory | ||
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService | ||
|
||
/**
 * Rejects extracted text that consists mostly of non-alphanumeric characters.
 */
class FileContentValidator {

    companion object {
        // Log under this class so messages are attributed to the validator, not to RepositoryService.
        private val logger = LoggerFactory.getLogger(FileContentValidator::class.java)

        /** Minimum share of letters and digits (0.0 - 1.0) a text must contain to be accepted. */
        private const val MIN_ALPHANUMERICS_RATIO = 0.5
    }

    /**
     * Checks whether [content] is acceptable.
     *
     * @param content the text to check; `null` and the empty string are accepted as-is.
     * @return `false` only when fewer than half of the characters are letters or digits.
     */
    fun isValid(content: String?): Boolean {
        if (content.isNullOrEmpty()) {
            return true
        }
        return hasEnoughAlphanumerics(content)
    }

    /**
     * Compares the share of letters and digits in [content] (whitespace and punctuation count
     * against it) with [MIN_ALPHANUMERICS_RATIO]. [content] must not be empty.
     */
    private fun hasEnoughAlphanumerics(content: String): Boolean {
        val alphanumerics = content.count { it.isLetterOrDigit() }
        // Double keeps the ratio exact for text lengths where Float would already round.
        val ratio = alphanumerics.toDouble() / content.length
        if (ratio < MIN_ALPHANUMERICS_RATIO) {
            logger.warn("Number of alphanumeric chars is less than 50%. OCR result will be turned to null")
            return false
        }
        return true
    }
}
2 changes: 1 addition & 1 deletion
2
.../ksi/mordor/services/FileTextExtractor.kt → ...vices/text/extractor/FileTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
20 changes: 20 additions & 0 deletions
20
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/ImageTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import java.io.File | ||
import org.bytedeco.leptonica.global.lept.pixRead | ||
import org.bytedeco.tesseract.TessBaseAPI | ||
|
||
/**
 * Runs Tesseract OCR on a single image file.
 */
class ImageTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {

    /**
     * Reads [file] as an image and returns the recognised text.
     *
     * @param file image file to recognise.
     * @param maxLength maximum number of characters to return; a negative value disables the limit.
     * @return the recognised text, or `null` when the image could not be read or Tesseract
     * produced no result.
     */
    override fun extract(file: File, maxLength: Int): String? {
        // pixRead hands back a null pointer when the file cannot be decoded.
        val pix = pixRead(file.absolutePath) ?: return null
        try {
            tessBaseAPI.SetImage(pix)

            val utf8Text = tessBaseAPI.GetUTF8Text() ?: return null
            val res = try {
                utf8Text.string.trimIndent()
            } finally {
                // The text buffer is allocated natively and is not reclaimed by the JVM GC.
                utf8Text.deallocate()
            }

            return if (maxLength >= 0) res.take(maxLength) else res
        } finally {
            // The PIX is native memory as well; release it on every exit path.
            org.bytedeco.leptonica.global.lept.pixDestroy(pix)
        }
    }
}
82 changes: 82 additions & 0 deletions
82
src/main/kotlin/pl/edu/uj/ii/ksi/mordor/services/text/extractor/PDFTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
package pl.edu.uj.ii.ksi.mordor.services.text.extractor | ||
|
||
import com.recognition.software.jdeskew.ImageDeskew | ||
import java.awt.image.BufferedImage | ||
import java.io.File | ||
import java.io.IOException | ||
import java.util.LinkedList | ||
import javax.imageio.ImageIO | ||
import net.sourceforge.tess4j.util.ImageHelper | ||
import org.apache.pdfbox.pdmodel.PDDocument | ||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject | ||
import org.bytedeco.tesseract.TessBaseAPI | ||
import org.slf4j.LoggerFactory | ||
import org.springframework.stereotype.Service | ||
import pl.edu.uj.ii.ksi.mordor.services.repository.RepositoryService | ||
|
||
/**
 * Extracts text from PDFs by running OCR on the images embedded in their pages.
 *
 * Each image is converted to grayscale, deskewed when needed, written to a temporary JPEG
 * and handed to [ImageTextExtractor].
 */
@Service
class PDFTextExtractor(private val tessBaseAPI: TessBaseAPI) : FileTextExtractor {

    // Skew angles with an absolute value at or below this threshold are left uncorrected.
    private val maxSkewAngle = 0.05

    companion object {
        // Log under this class so messages are attributed to the extractor, not to RepositoryService.
        private val logger = LoggerFactory.getLogger(PDFTextExtractor::class.java)
    }

    /**
     * Returns the concatenated OCR text of all images found in [file].
     *
     * @param file the PDF to process.
     * @param maxLength maximum number of characters to return; a negative value disables the limit.
     * @return the recognised text, or `null` when the PDF contains no images.
     * @throws IOException if the PDF itself or one of its embedded images cannot be loaded.
     */
    override fun extract(file: File, maxLength: Int): String? {
        val bufferedImages = formatPDF(file)
        if (bufferedImages.isEmpty()) {
            return null
        }

        val imageExtractor = ImageTextExtractor(tessBaseAPI)
        val extracted = StringBuilder()
        // The suffix needs its dot; createTempFile appends it verbatim.
        val outputFile = File.createTempFile("temp", ".jpg")
        try {
            for (image in bufferedImages) {
                // Stop as soon as the limit is reached; further OCR output would be cut off anyway.
                if (maxLength >= 0 && extracted.length >= maxLength) {
                    break
                }

                try {
                    // write() returns false when no suitable writer exists; the temp file would then
                    // still hold the previous image (or nothing), so skip OCR for this image.
                    if (!ImageIO.write(correctTwisted(image), "jpg", outputFile)) {
                        logger.warn("Could not encode an image from " + file.absolutePath + " as jpg")
                        continue
                    }
                    imageExtractor.extract(outputFile, maxLength)?.let { extracted.append(it) }
                } catch (e: IOException) {
                    logger.error("Could not retrieve text from " + file.absolutePath, e)
                }
            }
        } finally {
            // Remove the temp file even when an unexpected exception escapes the loop.
            outputFile.delete()
        }

        val text = extracted.toString()
        return if (maxLength >= 0) text.take(maxLength) else text
    }

    /**
     * Collects every image XObject that is referenced directly from a page's resources.
     * XObjects of other kinds are skipped, and the document is closed before returning.
     */
    private fun formatPDF(pdfFile: File): List<BufferedImage> {
        val bufferedImages = mutableListOf<BufferedImage>()
        PDDocument.load(pdfFile).use { doc ->
            for (page in doc.pages) {
                // A page may carry no resource dictionary at all.
                val resources = page.resources ?: continue
                for (xObjectName in resources.xObjectNames) {
                    val xObject = resources.getXObject(xObjectName)
                    if (xObject is PDImageXObject) {
                        bufferedImages.add(xObject.image)
                    }
                }
            }
        }
        return bufferedImages
    }

    /**
     * Returns a grayscale copy of [image], rotated back by its detected skew angle when that
     * angle exceeds [maxSkewAngle].
     */
    private fun correctTwisted(image: BufferedImage): BufferedImage {
        val imageSkewAngle = ImageDeskew(image).skewAngle
        val grayscale = ImageHelper.convertImageToGrayscale(image)
        return if (kotlin.math.abs(imageSkewAngle) > maxSkewAngle) {
            ImageHelper.rotateImage(grayscale, -imageSkewAngle)
        } else {
            grayscale
        }
    }
}
2 changes: 1 addition & 1 deletion
2
.../mordor/services/TikaFileTextExtractor.kt → ...s/text/extractor/TikaFileTextExtractor.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.