Skip to content

Commit

Permalink
Merge pull request #5238 from inception-project/feature/5237-Allow-tu…
Browse files Browse the repository at this point in the history
…ning-PDF-import-parameters-via-settings

#5237 - Allow tuning PDF import parameters via settings
  • Loading branch information
reckart authored Jan 21, 2025
2 parents 710f561 + f57e150 commit f854a78
Show file tree
Hide file tree
Showing 14 changed files with 554 additions and 76 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,69 +27,68 @@ The maximum number of results can also be configured separately for each knowled
.Hard max results
A hard limit for the *Max results* parameter.

If no value for the parameter is specified, its default value is used. The default value is shown as
an example of how the parameter can be configured below:
If no value for the parameter is specified, its default value is used.
The default value is shown as an example of how the parameter can be configured below:

.Knowledge base settings overview
.Knowledge base settings
[cols="4*", options="header"]
|===
| Setting
| Description
| Default
| Example

| knowledge-base.enabled
| `knowledge-base.enabled`
| enable/disable KB support
| true
| false
| `true`
| `false`

| knowledge-base.default-max-results
| `knowledge-base.default-max-results`
| default result limit for SPARQL query
| 1000
| 10000
| `1000`
| `10000`

| knowledge-base.hard-max-results
| `knowledge-base.hard-max-results`
| hard limit for the maximum number of results from a query
| 10000
| 5000
| `10000`
| `5000`

| knowledge-base.cache-size
| `knowledge-base.cache-size`
| number of items (classes, instances and properties) to cache
| 100000
| 500000
| `100000`
| `500000`

| knowledge-base.cache-expire-delay
| `knowledge-base.cache-expire-delay`
| time before items are expunged from the cache
| 15m
| 1h
| `15m`
| `1h`

| knowledge-base.cache-refresh-delay
| `knowledge-base.cache-refresh-delay`
| time before items are asynchronously refreshed
| 5m
| 30m
| `5m`
| `30m`

| knowledge-base.render-cache-size
| `knowledge-base.render-cache-size`
| number of items (classes, instances and properties) to cache during rendering
| 10000
| 50000
| `10000`
| `50000`

| knowledge-base.render-cache-expire-delay
| `knowledge-base.render-cache-expire-delay`
| time before items are expunged from the render cache
| 10m
| 1h
| `10m`
| `1h`

| knowledge-base.render-cache-refresh-delay
| `knowledge-base.render-cache-refresh-delay`
| time before items are asynchronously refreshed when rendering
| 1m
| 5m
| `1m`
| `5m`

| knowledge-base.remove-orphans-on-start
| `knowledge-base.remove-orphans-on-start`
| whether to delete orphaned KBs on start
| false
| true
| `false`
| `true`
|===

NOTE: Disabling the knowledge base support will lead to the loss of concept linked features from
documents/projects that were using them. If you wish to run the application without knowledge base
support, it is strongly recommended to disable the feature immediately after the installation and
not after any projects have potentially started using it.
documents/projects that were using them.
If you wish to run the application without knowledge base support, it is strongly recommended to disable the feature immediately after the installation and not after any projects have potentially started using it.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ NOTE: Disabling document metadata support prevents new document metadata layers
created, but it does not prevent the use of existing document metadata layers in order
not to break existing projects.

.Knowledge base settings overview
.Document metadata settings
[cols="4*", options="header"]
|===
| Setting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Support for this feature will be removed in a future version. The replacement is

This section describes the global settings related to the legacy PDF annotation editor module.

.Knowledge base settings overview
.Legacy PDF editor settings
[cols="4*", options="header"]
|===
| Setting
Expand Down
4 changes: 4 additions & 0 deletions inception/inception-pdf-editor2/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@
<groupId>org.springframework</groupId>
<artifactId>spring-webmvc</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-autoconfigure</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package de.tudarmstadt.ukp.inception.pdfeditor2.config;

import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

Expand All @@ -29,6 +30,7 @@
* Provides support for a PDF-oriented annotation editor.
*/
@Configuration
@EnableConfigurationProperties(PdfFormatPropertiesImpl.class)
public class PdfAnnotationEditor2SupportAutoConfiguration
{
@ConditionalOnProperty(prefix = "ui.pdf", name = "enabled", havingValue = "true", matchIfMissing = true)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.pdfeditor2.config;

import de.tudarmstadt.ukp.inception.pdfeditor2.visual.VisualPDFTextStripper;

/**
* Read-only view of the tunable parameters for PDF text extraction. Each property is documented
* together with the {@link VisualPDFTextStripper} setter it relates to (see the {@code @see} tag
* on each accessor).
*/
public interface PdfFormatProperties
{
/**
* @return Whether to sort the text as it appears on screen or leave it as it appears in the PDF
* file. This may help with PDFs generated by writers that output text sorted by style.
* However, it may cause problems with other types of PDFs, e.g. such that contain
* watermark text in the background.
*
* @see VisualPDFTextStripper#setSortByPosition(boolean)
*/
boolean isSortByPosition();

/**
* @return Whether to suppress duplicate overlapping text. By default the text stripper will
* attempt to remove overlapping text. Word paints the same character several times in
* order to make it look bold. By setting this to false all text will be extracted,
* which means that certain sections will be duplicated, but better performance will be
* noticed.
*
* @see VisualPDFTextStripper#setSuppressDuplicateOverlappingText(boolean)
*/
boolean isSuppressDuplicateOverlappingText();

/**
* @return whether the text stripper should group the text output by a list of beads.
*
* @see VisualPDFTextStripper#setShouldSeparateByBeads(boolean)
*/
boolean isShouldSeparateByBeads();

/**
* @return whether some additional text formatting will be added.
*
* @see VisualPDFTextStripper#setAddMoreFormatting(boolean)
*/
boolean isAddMoreFormatting();

/**
* @return the multiple of whitespace character widths for the current text which the current
* line start can be indented from the previous line start beyond which the current line
* start is considered to be a paragraph start.
*
* @see VisualPDFTextStripper#setIndentThreshold(float)
*/
float getIndentThreshold();

/**
* @return the minimum whitespace, as a multiple of the max height of the current characters,
* beyond which the current line start is considered to be a paragraph start.
*
* @see VisualPDFTextStripper#setDropThreshold(float)
*/
float getDropThreshold();

/**
* @return the character width-based tolerance value that is used to estimate where spaces in
* text should be added. Note that the default value for this has been determined from
* trial and error. Setting this value larger will reduce the number of spaces added.
*
* @see VisualPDFTextStripper#setAverageCharTolerance(float)
*/
float getAverageCharTolerance();

/**
* @return the space width-based tolerance value that is used to estimate where spaces in text
* should be added. Note that the default value for this has been determined from trial
* and error. Setting this value larger will reduce the number of spaces added.
*
* @see VisualPDFTextStripper#setSpacingTolerance(float)
*/
float getSpacingTolerance();

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.pdfeditor2.config;

import org.springframework.boot.context.properties.ConfigurationProperties;

/**
 * Mutable holder for the PDF format settings, bound to configuration keys under the
 * {@code format.pdf} prefix. The field initializers below are the values returned when a setter
 * has not been called.
 */
@ConfigurationProperties("format.pdf")
public class PdfFormatPropertiesImpl
    implements PdfFormatProperties
{
    // Text ordering and de-duplication switches
    private boolean sortByPosition = false;
    private boolean suppressDuplicateOverlappingText = true;
    private boolean shouldSeparateByBeads = true;

    // Extra formatting switch and the thresholds for indent and drop
    private boolean addMoreFormatting = true;
    private float indentThreshold = 2.0f;
    private float dropThreshold = 2.5f;

    // Tolerances for average character width and spacing
    private float averageCharTolerance = 0.3f;
    private float spacingTolerance = 0.5f;

    /** {@inheritDoc} */
    @Override
    public boolean isSortByPosition()
    {
        return this.sortByPosition;
    }

    /**
     * @param aSortByPosition
     *            new value of the sort-by-position flag.
     */
    public void setSortByPosition(boolean aSortByPosition)
    {
        this.sortByPosition = aSortByPosition;
    }

    /** {@inheritDoc} */
    @Override
    public boolean isSuppressDuplicateOverlappingText()
    {
        return this.suppressDuplicateOverlappingText;
    }

    /**
     * @param aSuppressDuplicateOverlappingText
     *            new value of the suppress-duplicate-overlapping-text flag.
     */
    public void setSuppressDuplicateOverlappingText(boolean aSuppressDuplicateOverlappingText)
    {
        this.suppressDuplicateOverlappingText = aSuppressDuplicateOverlappingText;
    }

    /** {@inheritDoc} */
    @Override
    public boolean isShouldSeparateByBeads()
    {
        return this.shouldSeparateByBeads;
    }

    /**
     * @param aShouldSeparateByBeads
     *            new value of the separate-by-beads flag.
     */
    public void setShouldSeparateByBeads(boolean aShouldSeparateByBeads)
    {
        this.shouldSeparateByBeads = aShouldSeparateByBeads;
    }

    /** {@inheritDoc} */
    @Override
    public boolean isAddMoreFormatting()
    {
        return this.addMoreFormatting;
    }

    /**
     * @param aAddMoreFormatting
     *            new value of the add-more-formatting flag.
     */
    public void setAddMoreFormatting(boolean aAddMoreFormatting)
    {
        this.addMoreFormatting = aAddMoreFormatting;
    }

    /** {@inheritDoc} */
    @Override
    public float getIndentThreshold()
    {
        return this.indentThreshold;
    }

    /**
     * @param aIndentThreshold
     *            new indent threshold.
     */
    public void setIndentThreshold(float aIndentThreshold)
    {
        this.indentThreshold = aIndentThreshold;
    }

    /** {@inheritDoc} */
    @Override
    public float getDropThreshold()
    {
        return this.dropThreshold;
    }

    /**
     * @param aDropThreshold
     *            new drop threshold.
     */
    public void setDropThreshold(float aDropThreshold)
    {
        this.dropThreshold = aDropThreshold;
    }

    /** {@inheritDoc} */
    @Override
    public float getAverageCharTolerance()
    {
        return this.averageCharTolerance;
    }

    /**
     * @param aAverageCharTolerance
     *            new average character tolerance.
     */
    public void setAverageCharTolerance(float aAverageCharTolerance)
    {
        this.averageCharTolerance = aAverageCharTolerance;
    }

    /** {@inheritDoc} */
    @Override
    public float getSpacingTolerance()
    {
        return this.spacingTolerance;
    }

    /**
     * @param aSpacingTolerance
     *            new spacing tolerance.
     */
    public void setSpacingTolerance(float aSpacingTolerance)
    {
        this.spacingTolerance = aSpacingTolerance;
    }
}
Loading

0 comments on commit f854a78

Please sign in to comment.