diff --git a/README.md b/README.md index 8d3c8210..722f69fe 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Tabula helps you extract tables from PDFs between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual width of the page. Example: --columns %25,50,80.6 - -f,--format Output format: (CSV,TSV,JSON). Default: CSV + -f,--format Output format: (CSV,TSV,JSON,SJSON). Default: CSV -g,--guess Guess the portion of the page to analyze per page. -h,--help Print this help text. @@ -67,6 +67,7 @@ Tabula helps you extract tables from PDFs -u,--use-line-returns Use embedded line returns in cells. (Only in spreadsheet mode.) -v,--version Print version and exit. + -tn, --tableNames 筛选要输出的表 ``` It also includes a debugging tool, run `java -cp ./target/tabula-1.0.5-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options. diff --git a/pom.xml b/pom.xml index fb1f7e08..3cd1f17e 100644 --- a/pom.xml +++ b/pom.xml @@ -124,26 +124,27 @@ - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - - - --pinentry-mode - loopback - - - - - + + + + + + + + + + + + + + + + + + + + + maven-compiler-plugin 3.8.1 @@ -159,33 +160,43 @@ technology.tabula.CommandLineApp - - - jar-with-dependencies - + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.2 + + + -Xms1024m -Xmx2048m + true - - - org.apache.maven.plugins - maven-surefire-plugin - 2.22.2 - - - -Xms1024m -Xmx2048m - - - - - org.apache.maven.plugins - maven-eclipse-plugin - 2.10 - - true - true - - - - + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.10 + + true + true + + + + @@ -221,6 +232,7 @@ + diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java index 3a6773a9..ffd5b6f7 100644 --- a/src/main/java/technology/tabula/CommandLineApp.java +++ b/src/main/java/technology/tabula/CommandLineApp.java @@ -7,7 +7,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Map; +import javafx.scene.control.Tab; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; @@ -17,14 +19,12 @@ import org.apache.commons.cli.DefaultParser; import org.apache.pdfbox.pdmodel.PDDocument; +import org.locationtech.jts.util.StringUtil; import technology.tabula.detectors.DetectionAlgorithm; import technology.tabula.detectors.NurminenDetectionAlgorithm; import technology.tabula.extractors.BasicExtractionAlgorithm; import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; -import technology.tabula.writers.CSVWriter; -import technology.tabula.writers.JSONWriter; -import technology.tabula.writers.TSVWriter; -import technology.tabula.writers.Writer; +import technology.tabula.writers.*; public class CommandLineApp { @@ -44,6 +44,8 @@ public class CommandLineApp { private OutputFormat outputFormat; private String password; private TableExtractor tableExtractor; + private Map> tableMap; + public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException { this.defaultOutput = defaultOutput; @@ -51,6 +53,7 @@ public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseEx this.pages = CommandLineApp.whichPages(line); this.outputFormat = CommandLineApp.whichOutputFormat(line); this.tableExtractor = CommandLineApp.createExtractor(line); + this.tableMap = CommandLineApp.whichTableMap(line); if (line.hasOption('s')) { this.password = line.getOptionValue('s'); @@ -165,6 +168,10 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException while (pageIterator.hasNext()) { Page page = pageIterator.next(); + if (page == null) { + continue; + } + if (tableExtractor.verticalRulingPositions != null) { for (Float verticalRulingPosition : tableExtractor.verticalRulingPositions) { page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight())); @@ -200,7 +207,8 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException } private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException { - ObjectExtractor extractor = new ObjectExtractor(pdfDocument); + ListtableNames=new ArrayList(tableMap.keySet()); + ObjectExtractor extractor = new ObjectExtractor(pdfDocument, tableNames); return (pages == null) ? extractor.extract() : extractor.extract(pages); @@ -249,9 +257,17 @@ private static List> whichAreas(CommandLine line) throw private static List whichPages(CommandLine line) throws ParseException { String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1"; + String tableName = line.hasOption("tn") ? line.getOptionValue("tn") : ""; + if (!"".equals(tableName) && "1".equals(pagesOption)) + pagesOption = "all"; return Utils.parsePagesOption(pagesOption); } + private static Map> whichTableMap(CommandLine line) throws ParseException{ + String pagesOption = line.hasOption("tn") ? line.getOptionValue("tn") : ""; + return Utils.parseTableMapOption(pagesOption); + } + private static ExtractionMethod whichExtractionMethod(CommandLine line) { // -r/--spreadsheet [deprecated; use -l] or -l/--lattice if (line.hasOption('r') || line.hasOption('l')) { @@ -363,7 +379,12 @@ public static Options buildOptions() { .hasArg() .argName("PAGES") .build()); - + o.addOption(Option.builder("tn") + .longOpt("tableNames") + .desc("Comma separated list of TableName, or all. Examples: --tableName table1,table2") + .hasArg() + .argName("TABLENAMES") + .build()); return o; } @@ -467,6 +488,9 @@ private void writeTables(List tables, Appendable out) throws IOException case JSON: writer = new JSONWriter(); break; + case SJSON: + writer = new SJSONWriter(tableMap); + break; case TSV: writer = new TSVWriter(); break; @@ -481,6 +505,7 @@ private String getOutputFilename(File pdfFile) { extension = ".csv"; break; case JSON: + case SJSON: extension = ".json"; break; case TSV: @@ -493,7 +518,8 @@ private String getOutputFilename(File pdfFile) { private enum OutputFormat { CSV, TSV, - JSON; + JSON, + SJSON; static String[] formatNames() { OutputFormat[] values = OutputFormat.values(); diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java index 9f3f6a03..2416baf0 100644 --- a/src/main/java/technology/tabula/ObjectExtractor.java +++ b/src/main/java/technology/tabula/ObjectExtractor.java @@ -1,6 +1,7 @@ package technology.tabula; import java.io.IOException; +import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -9,8 +10,11 @@ public class ObjectExtractor implements java.io.Closeable { private final PDDocument pdfDocument; - public ObjectExtractor(PDDocument pdfDocument) { + private final List tableNames; + + public ObjectExtractor(PDDocument pdfDocument, List tableNames) { this.pdfDocument = pdfDocument; + this.tableNames = tableNames; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // @@ -19,13 +23,21 @@ protected Page extractPage(Integer pageNumber) throws IOException { throw new java.lang.IndexOutOfBoundsException("Page number does not exist."); } PDPage page = pdfDocument.getPage(pageNumber - 1); - + ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page); streamEngine.processPage(page); TextStripper textStripper = new TextStripper(pdfDocument, pageNumber); textStripper.process(); - + String tableName = ""; + //TODO 判断表名是否存在 + if (tableNames != null){ + //采用文本包含方式判断表名,后续需优化 + tableName = Utils.findTableName(tableNames, pdfTextStripper.getContent()); + if ("".equals(tableName)) { + return null; + } + } Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER); float width, height; @@ -49,6 +61,8 @@ protected Page extractPage(Integer pageNumber) throws IOException { .withMinCharWidth(textStripper.getMinCharWidth()) .withMinCharHeight(textStripper.getMinCharHeight()) .withIndex(textStripper.getSpatialIndex()) + .withContent(pdfTextStripper.getContent()) + .withTableName(tableName) .build(); } diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java index ed74d14a..6d24400d 100644 --- a/src/main/java/technology/tabula/Page.java +++ b/src/main/java/technology/tabula/Page.java @@ -15,13 +15,16 @@ @SuppressWarnings("serial") // TODO: this class should probably be called "PageArea" or something like that public class Page extends Rectangle { - + private int number; private Integer rotation; private float minCharWidth; private float minCharHeight; private List textElements; + + private String content; + private String tableName; // TODO: Create a class for 'List ' that encapsulates all of these lists and their behaviors? private List rulings, @@ -47,6 +50,8 @@ private Page( float minCharWidth, float minCharHeight, RectangleSpatialIndex index + String content, + String tableName ) { super(pageDims.getTop(), pageDims.getLeft(), pageDims.getWidth(), pageDims.getHeight()); this.rotation = rotation; @@ -58,6 +63,8 @@ private Page( this.minCharWidth = minCharWidth; this.minCharHeight = minCharHeight; this.spatialIndex = index; + this.content = content; + this.tableName = tableName; } /** @@ -342,6 +349,8 @@ public static class Builder { private float minCharWidth; private float minCharHeight; private RectangleSpatialIndex index; + private String content; + private String tableName; private Builder() {} @@ -361,9 +370,21 @@ public Builder withRotation(int rotation) { return this; } + public Builder withContent(String content) { + this.content = content; + + return this; + } + + public Builder withTableName(String tableName) { + this.tableName = tableName; + return this; + } + public Builder withNumber(int number) { this.number = number; + return this; } @@ -410,7 +431,7 @@ public Builder withIndex(RectangleSpatialIndex index) { } public Page build() { - return new Page(pageDims, rotation, number, pdPage, pdDocument, textElements, rulings, minCharWidth, minCharHeight, index); + return new Page(pageDims, rotation, number, pdPage, pdDocument, textElements, rulings, minCharWidth, minCharHeight, index, content, tableName); } } } diff --git a/src/main/java/technology/tabula/Table.java b/src/main/java/technology/tabula/Table.java index 1e73bedf..d7178475 100644 --- a/src/main/java/technology/tabula/Table.java +++ b/src/main/java/technology/tabula/Table.java @@ -23,6 +23,7 @@ public Table(ExtractionAlgorithm extractionAlgorithm) { private int rowCount = 0; private int colCount = 0; + private String tableName; private int pageNumber = 0; /* visible for testing */ final TreeMap cells = new TreeMap<>(); @@ -32,6 +33,14 @@ public Table(ExtractionAlgorithm extractionAlgorithm) { public int getPageNumber() { return pageNumber; } public void setPageNumber(int pageNumber) { this.pageNumber = pageNumber; } + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + public String getExtractionMethod() { return extractionMethod; } public void add(RectangularTextContainer chunk, int row, int col) { diff --git a/src/main/java/technology/tabula/TextStripper.java b/src/main/java/technology/tabula/TextStripper.java index 557fa439..f5020d71 100644 --- a/src/main/java/technology/tabula/TextStripper.java +++ b/src/main/java/technology/tabula/TextStripper.java @@ -18,13 +18,19 @@ public class TextStripper extends PDFTextStripper { private static final float AVG_HEIGHT_MULT_THRESHOLD = 6.0f; private static final float MAX_BLANK_FONT_SIZE = 40.0f; private static final float MIN_BLANK_FONT_SIZE = 2.0f; - private final PDDocument document; - private final ArrayList textElements; - private final RectangleSpatialIndex spatialIndex; - private float minCharWidth = Float.MAX_VALUE; - private float minCharHeight = Float.MAX_VALUE; - private float totalHeight = 0.0f; - private int countHeight = 0; + private PDDocument document; + public ArrayList textElements; + public RectangleSpatialIndex spatialIndex; + public float minCharWidth = Float.MAX_VALUE; + public float minCharHeight = Float.MAX_VALUE; + public float totalHeight = 0.0f; + public int countHeight = 0; + private String content; + + public String getContent() { + return content; + } + public TextStripper(PDDocument document, int pageNumber) throws IOException { super(); @@ -36,12 +42,13 @@ public TextStripper(PDDocument document, int pageNumber) throws IOException { } public void process() throws IOException { - this.getText(this.document); + content = this.getText(this.document); } @Override protected void writeString(String string, List textPositions) throws IOException { + super.writeString(string, textPositions); for (TextPosition textPosition: textPositions) { if (textPosition == null) { diff --git a/src/main/java/technology/tabula/Utils.java b/src/main/java/technology/tabula/Utils.java index 00814429..66b5ef00 100644 --- a/src/main/java/technology/tabula/Utils.java +++ b/src/main/java/technology/tabula/Utils.java @@ -8,7 +8,10 @@ import java.io.IOException; import java.math.BigDecimal; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javafx.scene.control.Tab; import org.apache.commons.cli.ParseException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -58,7 +61,6 @@ public static Rectangle bounds(Collection shapes) { } return rv; - } // range iterator @@ -117,6 +119,28 @@ public static List> transpose(List> table) { return ret; } + public static Table maxColTable(List tables){ + int colCount = 0; + Table table = null; + for (int i = 0; i< tables.size();i++) { + Table t = tables.get(i); + if (t.getColCount() > colCount || i == 0){ + colCount = t.getColCount(); + table = t; + } + } + return table; + } + + public static boolean isEmptyRow(List rows){ + for(String item: rows){ + if (item != null && !"".equals(item)){ + return false; + } + } + return true; + } + /** * Wrap Collections.sort so we can fallback to a non-stable quicksort if we're * running on JDK7+ @@ -187,6 +211,63 @@ public static List parsePagesOption(String pagesSpec) throws ParseExcep return rv; } + public static Map> parseTableMapOption(String tableNamesSpec) throws ParseException { + if (tableNamesSpec.equals("")) { + return null; + } + Map> rv = new HashMap<>(); + String[] ranges = tableNamesSpec.split(","); + for (int i = 0; i < ranges.length; i++) { + List cols = new ArrayList<>(); + //解析表名和列 + String[] tns = ranges[i].split("\\["); + if ("".equals(tns[0])) + continue; + String tableName = tns[0]; + rv.put(tableName,cols); + List colGroup= findContentByRegex(ranges[i], "\\\\[(.*?)]"); + for(String str: colGroup){ + String[] cns = str.split("|"); + for(String item: cns) cols.add(item); + } + } + return rv; + } + + public static List findContentByRegex(String content, String regex){ + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(content); + List lstStr = new ArrayList<>(); + while (matcher.find()) { + lstStr.add(matcher.group(1)); + } + return lstStr; + } + + public static boolean isMatch(String content, String regex){ + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(content); + return matcher.matches(); + } + + public static boolean containTable(List tableNames, String content){ + for (String tableName: tableNames) { + if (content.contains(tableName)){ + return true; + } + } + return false; + } + + public static String findTableName(List tableNames, String content){ + for (String tableName: tableNames) { + if (content.contains(tableName)){ + return tableName; + } + } + return ""; + } + public static void snapPoints(List rulings, float xThreshold, float yThreshold) { // collect points and keep a Line -> p1,p2 map diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java index 91609045..0464515d 100644 --- a/src/main/java/technology/tabula/debug/Debug.java +++ b/src/main/java/technology/tabula/debug/Debug.java @@ -217,7 +217,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re boolean drawDetectedTables) throws IOException { PDDocument document = PDDocument.load(new File(pdfPath)); - ObjectExtractor oe = new ObjectExtractor(document); + ObjectExtractor oe = new ObjectExtractor(document, null); Page page = oe.extract(pageNumber + 1); diff --git a/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java b/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java index ed2e78e3..f593cb3b 100644 --- a/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java +++ b/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java @@ -65,6 +65,7 @@ public int compare(Ruling arg0, Ruling arg1) { Table table = new Table(this); table.setRect(page.getLeft(), page.getTop(), page.getWidth(), page.getHeight()); + table.setTableName(page.getTableName()); table.setPageNumber(page.getPageNumber()); for (int i = 0; i < lines.size(); i++) { diff --git a/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java b/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java index 5b4af3d5..915f7536 100644 --- a/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java +++ b/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java @@ -89,6 +89,7 @@ else if (r.vertical()) { } TableWithRulingLines t = new TableWithRulingLines(area, overlappingCells, horizontalOverlappingRulings, verticalOverlappingRulings, this, page.getPageNumber()); + t.setTableName(page.getTableName()); spreadsheets.add(t); } Utils.sort(spreadsheets, Rectangle.ILL_DEFINED_ORDER); @@ -111,7 +112,8 @@ public boolean isTabular(Page page) { if (tables.isEmpty()) { return false; } - Table table = tables.get(0); + //Table table = tables.get(0); + Table table = Utils.maxColTable(tables); int rowsDefinedByLines = table.getRowCount(); int colsDefinedByLines = table.getColCount(); diff --git a/src/main/java/technology/tabula/outobjects/OutTable.java b/src/main/java/technology/tabula/outobjects/OutTable.java new file mode 100644 index 00000000..cd2fd42a --- /dev/null +++ b/src/main/java/technology/tabula/outobjects/OutTable.java @@ -0,0 +1,33 @@ +package technology.tabula.outobjects; + +import java.util.List; + +public class OutTable { + private String name; + private List column; + private List> data; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getColumn() { + return column; + } + + public void setColumn(List column) { + this.column = column; + } + + public List> getData() { + return data; + } + + public void setData(List> data) { + this.data = data; + } +} diff --git a/src/main/java/technology/tabula/writers/SJSONWriter.java b/src/main/java/technology/tabula/writers/SJSONWriter.java new file mode 100644 index 00000000..72e89989 --- /dev/null +++ b/src/main/java/technology/tabula/writers/SJSONWriter.java @@ -0,0 +1,99 @@ +package technology.tabula.writers; + +import com.google.gson.*; +import technology.tabula.*; +import technology.tabula.json.RectangularTextContainerSerializer; +import technology.tabula.json.TableSerializer; +import technology.tabula.outobjects.OutTable; + +import java.io.IOException; +import java.lang.reflect.Modifier; +import java.util.*; + +public class SJSONWriter implements Writer { + + private Map> tableMap; + private static final ExclusionStrategy ALLCLASSES_SKIPNONPUBLIC = new ExclusionStrategy() { + @Override public boolean shouldSkipClass(Class c) { return false; } + @Override public boolean shouldSkipField(FieldAttributes fa) { return !fa.hasModifier(Modifier.PUBLIC); } + }; + + public SJSONWriter(Map> tableMap){ + this.tableMap = tableMap; + } + + @Override + public void write(Appendable out, Table table) throws IOException { + write(out, Collections.singletonList(table)); + } + + @Override + public void write(Appendable out, List
tables) throws IOException { + Gson gson = gson(); + Map outTableMap = new HashMap<>(); + OutTable outTable = null; + for (Table table : tables) { + if (table.getRowCount() > 0){ + String tableName = table.getTableName(); + if (outTableMap.containsKey(tableName)) { + outTable = outTableMap.get(tableName); + } + else { + outTable = new OutTable(); + outTable.setName(tableName); + outTable.setColumn(new ArrayList<>()); + outTable.setData(new ArrayList<>()); + outTableMap.put(tableName, outTable); + } + int dataRow = 0; + //查找列的位置及数据开始位置 + /* + Map colPos = null; + if (tableMap != null) { + List cols = tableMap.get(tableName); + for(String item: cols){ + colPos.put(item, null); + } + } + */ + + /* + if (colPos != null) { + for (int i = 0; i < table.getRows().size(); i++) { + List row = table.getRows().get(i); + for (int j = 0; j< row.size(); j++) { + RectangularTextContainer tc = row.get(j); + for (String key : colPos.keySet()) { + if (colPos.get(key) != null && Utils.isMatch(tc.getText(),key)){ + outTable.getColumn().add(tc.getText()); + colPos.put(key, Integer.valueOf(j)); + dataRow = i; + } + } + } + } + } + */ + for(int i = dataRow; i< table.getRows().size(); i++){ + List row = table.getRows().get(i); + List cells = new ArrayList<>(row.size()); + for (RectangularTextContainer tc : row) { + cells.add(tc.getText()); + } + if (!Utils.isEmptyRow(cells)) outTable.getData().add(cells); + } + } + } + JsonArray array = new JsonArray(); + for (Map.Entry m : outTableMap.entrySet()) { + array.add(gson.toJsonTree(m.getValue(), OutTable.class)); + } + out.append(gson.toJson(array)); + } + + private static Gson gson() { + return new GsonBuilder() + .create(); + } + +} diff --git a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java index 9db7ad18..b36d1539 100644 --- a/src/test/java/technology/tabula/TestObjectExtractor.java +++ b/src/test/java/technology/tabula/TestObjectExtractor.java @@ -61,7 +61,6 @@ public void testTextExtractionDoesNotRaise() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); - assertTrue(pi.hasNext()); assertNotNull(pi.next()); assertFalse(pi.hasNext()); @@ -73,7 +72,6 @@ public void testShouldDetectRulings() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); - Page page = pi.next(); List rulings = page.getRulings(); @@ -86,7 +84,6 @@ public void testShouldDetectRulings() throws IOException { @Test public void testDontThrowNPEInShfill() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf")); - try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); assertTrue(pi.hasNext()); @@ -106,7 +103,6 @@ public void testExtractOnePage() throws IOException { try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page page = oe.extract(2); - assertNotNull(page); } @@ -116,7 +112,6 @@ public void testExtractOnePage() throws IOException { public void testExtractWrongPageNumber() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); - try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { oe.extract(3); } @@ -125,7 +120,6 @@ public void testExtractWrongPageNumber() throws IOException { @Test public void testTextElementsContainedInPage() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); - try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page page = oe.extractPage(1); @@ -138,7 +132,6 @@ public void testTextElementsContainedInPage() throws IOException { @Test public void testDoNotNPEInPointComparator() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); - try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page p = oe.extractPage(1); assertNotNull(p); diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 6e58f6a4..3efdf2d8 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -163,7 +163,7 @@ public void testDetectionOfTables() throws Exception { // tabula extractors PDDocument pdfDocument = PDDocument.load(this.pdf); - ObjectExtractor extractor = new ObjectExtractor(pdfDocument); + ObjectExtractor extractor = new ObjectExtractor(pdfDocument, null); // parse expected tables from the ground truth dataset Map> expectedTables = new HashMap<>(); diff --git a/src/test/java/technology/tabula/UtilsForTesting.java b/src/test/java/technology/tabula/UtilsForTesting.java index 3ee8efde..6e82b4b5 100644 --- a/src/test/java/technology/tabula/UtilsForTesting.java +++ b/src/test/java/technology/tabula/UtilsForTesting.java @@ -25,7 +25,7 @@ public static Page getPage(String path, int pageNumber) throws IOException { try { PDDocument document = PDDocument .load(new File(path)); - oe = new ObjectExtractor(document); + oe = new ObjectExtractor(document, null); Page page = oe.extract(pageNumber); return page; } finally {