-
Notifications
You must be signed in to change notification settings - Fork 156
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #5238 from inception-project/feature/5237-Allow-tuning-PDF-import-parameters-via-settings
#5237 - Allow tuning PDF import parameters via settings
- Loading branch information
Showing
14 changed files
with
554 additions
and
76 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
94 changes: 94 additions & 0 deletions
94
...or2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/config/PdfFormatProperties.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
/* | ||
* Licensed to the Technische Universität Darmstadt under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The Technische Universität Darmstadt | ||
* licenses this file to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.tudarmstadt.ukp.inception.pdfeditor2.config; | ||
|
||
import de.tudarmstadt.ukp.inception.pdfeditor2.visual.VisualPDFTextStripper; | ||
|
||
/**
 * Tunable parameters for importing PDF documents. Every property corresponds to the like-named
 * setter on {@link VisualPDFTextStripper} (see the {@code @see} tag on each accessor).
 */
public interface PdfFormatProperties
{
    /**
     * @return whether to sort the text as it appears on screen instead of leaving it in the order
     *         in which it appears in the PDF file. Sorting may help with PDFs generated by
     *         writers that output text sorted by style. However, it may cause problems with other
     *         types of PDFs, e.g. those that contain watermark text in the background.
     *
     * @see VisualPDFTextStripper#setSortByPosition(boolean)
     */
    boolean isSortByPosition();

    /**
     * @return whether to suppress duplicate overlapping text. When enabled, the text stripper
     *         attempts to remove text that overlaps other text (e.g. Word paints the same
     *         character several times in order to make it look bold). When disabled, all text is
     *         extracted, which means that certain sections will be duplicated, but performance
     *         will be better.
     *
     * @see VisualPDFTextStripper#setSuppressDuplicateOverlappingText(boolean)
     */
    boolean isSuppressDuplicateOverlappingText();

    /**
     * @return whether the text stripper should group the text output by a list of beads.
     *
     * @see VisualPDFTextStripper#setShouldSeparateByBeads(boolean)
     */
    boolean isShouldSeparateByBeads();

    /**
     * @return whether some additional text formatting is added to the extracted text.
     *
     * @see VisualPDFTextStripper#setAddMoreFormatting(boolean)
     */
    boolean isAddMoreFormatting();

    /**
     * @return the multiple of whitespace character widths (for the current text) by which the
     *         current line start may be indented from the previous line start; beyond this value
     *         the current line start is considered to be a paragraph start.
     *
     * @see VisualPDFTextStripper#setIndentThreshold(float)
     */
    float getIndentThreshold();

    /**
     * @return the minimum whitespace, as a multiple of the maximum height of the current
     *         characters, beyond which the current line start is considered to be a paragraph
     *         start.
     *
     * @see VisualPDFTextStripper#setDropThreshold(float)
     */
    float getDropThreshold();

    /**
     * @return the character width-based tolerance value that is used to estimate where spaces in
     *         the text should be added. Note that the default value for this has been determined
     *         by trial and error. A larger value reduces the number of spaces added.
     *
     * @see VisualPDFTextStripper#setAverageCharTolerance(float)
     */
    float getAverageCharTolerance();

    /**
     * @return the space width-based tolerance value that is used to estimate where spaces in the
     *         text should be added. Note that the default value for this has been determined by
     *         trial and error. A larger value reduces the number of spaces added.
     *
     * @see VisualPDFTextStripper#setSpacingTolerance(float)
     */
    float getSpacingTolerance();
}
124 changes: 124 additions & 0 deletions
124
...src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/config/PdfFormatPropertiesImpl.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
/* | ||
* Licensed to the Technische Universität Darmstadt under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The Technische Universität Darmstadt | ||
* licenses this file to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.tudarmstadt.ukp.inception.pdfeditor2.config; | ||
|
||
import org.springframework.boot.context.properties.ConfigurationProperties; | ||
|
||
@ConfigurationProperties("format.pdf") | ||
public class PdfFormatPropertiesImpl | ||
implements PdfFormatProperties | ||
{ | ||
private boolean sortByPosition = false; | ||
private boolean suppressDuplicateOverlappingText = true; | ||
private boolean shouldSeparateByBeads = true; | ||
|
||
private boolean addMoreFormatting = true; | ||
private float indentThreshold = 2.0f; | ||
private float dropThreshold = 2.5f; | ||
|
||
private float averageCharTolerance = 0.3f; | ||
private float spacingTolerance = 0.5f; | ||
|
||
@Override | ||
public boolean isSortByPosition() | ||
{ | ||
return sortByPosition; | ||
} | ||
|
||
public void setSortByPosition(boolean aSortByPosition) | ||
{ | ||
sortByPosition = aSortByPosition; | ||
} | ||
|
||
@Override | ||
public boolean isSuppressDuplicateOverlappingText() | ||
{ | ||
return suppressDuplicateOverlappingText; | ||
} | ||
|
||
public void setSuppressDuplicateOverlappingText(boolean aSuppressDuplicateOverlappingText) | ||
{ | ||
suppressDuplicateOverlappingText = aSuppressDuplicateOverlappingText; | ||
} | ||
|
||
@Override | ||
public boolean isShouldSeparateByBeads() | ||
{ | ||
return shouldSeparateByBeads; | ||
} | ||
|
||
public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads) | ||
{ | ||
shouldSeparateByBeads = aShouldSeparateByBeads; | ||
} | ||
|
||
@Override | ||
public boolean isAddMoreFormatting() | ||
{ | ||
return addMoreFormatting; | ||
} | ||
|
||
public void setAddMoreFormatting(boolean aAddMoreFormatting) | ||
{ | ||
addMoreFormatting = aAddMoreFormatting; | ||
} | ||
|
||
@Override | ||
public float getIndentThreshold() | ||
{ | ||
return indentThreshold; | ||
} | ||
|
||
public void setIndentThreshold(float aIndentThreshold) | ||
{ | ||
indentThreshold = aIndentThreshold; | ||
} | ||
|
||
@Override | ||
public float getDropThreshold() | ||
{ | ||
return dropThreshold; | ||
} | ||
|
||
public void setDropThreshold(float aDropThreshold) | ||
{ | ||
dropThreshold = aDropThreshold; | ||
} | ||
|
||
@Override | ||
public float getAverageCharTolerance() | ||
{ | ||
return averageCharTolerance; | ||
} | ||
|
||
public void setAverageCharTolerance(float aAverageCharTolerance) | ||
{ | ||
averageCharTolerance = aAverageCharTolerance; | ||
} | ||
|
||
@Override | ||
public float getSpacingTolerance() | ||
{ | ||
return spacingTolerance; | ||
} | ||
|
||
public void setSpacingTolerance(float aSpacingTolerance) | ||
{ | ||
spacingTolerance = aSpacingTolerance; | ||
} | ||
} |
Oops, something went wrong.