Skip to content

Commit

Permalink
Merge pull request #793 from IBM/fix-pdf2parquet-uint64-hash
Browse files Browse the repository at this point in the history
fix uint64 hash to pyarrow
  • Loading branch information
touma-I authored Nov 11, 2024
2 parents 723e675 + 6f5e2cd commit c72e98d
Show file tree
Hide file tree
Showing 15 changed files with 49 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import filetype
import pandas as pd
import pyarrow as pa
import numpy as np
from data_processing.transform import AbstractBinaryTransform, TransformConfiguration
from data_processing.utils import TransformUtils, get_logger, str2bool
from data_processing.utils.cli_utils import CLIArgumentProvider
Expand Down Expand Up @@ -237,7 +238,7 @@ def _convert_pdf2parquet(
num_pages = len(doc.pages)
num_tables = len(doc.tables)
num_doc_elements = len(doc.texts)
document_hash = doc.origin.binary_hash
document_hash = np.uint64(doc.origin.binary_hash)

self._update_metrics(num_pages=num_pages, elapse_time=elapse_time)

Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-29 14:17:59",
"end_time": "2024-10-29 14:18:05",
"start_time": "2024-11-11 21:04:30",
"end_time": "2024-11-11 21:04:38",
"status": "success"
},
"code": {
Expand All @@ -15,6 +15,7 @@
"path": "path"
},
"job_input_params": {
"batch_size": -1,
"artifacts_path": null,
"contents_type": "text/markdown",
"do_table_structure": true,
Expand All @@ -28,23 +29,25 @@
"random_samples": -1,
"files_to_use": [
".pdf",
".docx",
".pptx",
".zip"
],
"num_processors": 0
},
"execution_stats": {
"cpus": 16.8,
"cpus": 21.1,
"gpus": 0,
"memory": 31.22,
"memory": 32.09,
"object_store": 0,
"execution time, min": 0.108
"execution time, min": 0.139
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 33044,
"processing_time": 6.478,
"result_size": 32939,
"processing_time": 5.596,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-31 13:14:39",
"end_time": "2024-10-31 13:16:41",
"start_time": "2024-11-11 21:06:08",
"end_time": "2024-11-11 21:06:14",
"status": "success"
},
"code": {
Expand Down Expand Up @@ -36,22 +36,22 @@
"num_processors": 0
},
"execution_stats": {
"cpus": 39.0,
"cpus": 21.5,
"gpus": 0,
"memory": 29.87,
"memory": 32.19,
"object_store": 0,
"execution time, min": 2.029
"execution time, min": 0.1
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 1,
"processing_time": 3.888,
"processing_time": 3.353,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
"nskip": 0,
"result_size": 27200
"result_size": 27147
},
"source": {
"name": "/Users/dol/codes/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-29 14:20:01",
"end_time": "2024-10-29 14:20:07",
"start_time": "2024-11-11 21:05:31",
"end_time": "2024-11-11 21:05:36",
"status": "success"
},
"code": {
Expand All @@ -15,6 +15,7 @@
"path": "path"
},
"job_input_params": {
"batch_size": -1,
"artifacts_path": null,
"contents_type": "application/json",
"do_table_structure": true,
Expand All @@ -28,23 +29,25 @@
"random_samples": -1,
"files_to_use": [
".pdf",
".docx",
".pptx",
".zip"
],
"num_processors": 0
},
"execution_stats": {
"cpus": 18.0,
"cpus": 21.4,
"gpus": 0,
"memory": 30.77,
"memory": 32.33,
"object_store": 0,
"execution time, min": 0.105
"execution time, min": 0.096
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 22953,
"processing_time": 6.282,
"result_size": 22850,
"processing_time": 3.229,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-29 14:19:30",
"end_time": "2024-10-29 14:19:33",
"start_time": "2024-11-11 21:05:04",
"end_time": "2024-11-11 21:05:06",
"status": "success"
},
"code": {
Expand All @@ -15,6 +15,7 @@
"path": "path"
},
"job_input_params": {
"batch_size": -1,
"artifacts_path": null,
"contents_type": "text/markdown",
"do_table_structure": false,
Expand All @@ -28,23 +29,25 @@
"random_samples": -1,
"files_to_use": [
".pdf",
".docx",
".pptx",
".zip"
],
"num_processors": 0
},
"execution_stats": {
"cpus": 17.3,
"cpus": 21.6,
"gpus": 0,
"memory": 28.85,
"memory": 29.57,
"object_store": 0,
"execution time, min": 0.043
"execution time, min": 0.041
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 29659,
"processing_time": 2.554,
"result_size": 29555,
"processing_time": 1.997,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-10-29 14:17:59",
"end_time": "2024-10-29 14:18:05",
"start_time": "2024-11-11 21:04:30",
"end_time": "2024-11-11 21:04:38",
"status": "success"
},
"code": {
Expand All @@ -15,6 +15,7 @@
"path": "path"
},
"job_input_params": {
"batch_size": -1,
"artifacts_path": null,
"contents_type": "text/markdown",
"do_table_structure": true,
Expand All @@ -28,23 +29,25 @@
"random_samples": -1,
"files_to_use": [
".pdf",
".docx",
".pptx",
".zip"
],
"num_processors": 0
},
"execution_stats": {
"cpus": 16.8,
"cpus": 21.1,
"gpus": 0,
"memory": 31.22,
"memory": 32.09,
"object_store": 0,
"execution time, min": 0.108
"execution time, min": 0.139
},
"job_output_stats": {
"source_files": 2,
"source_size": 605137,
"result_files": 2,
"result_size": 33044,
"processing_time": 6.478,
"result_size": 32939,
"processing_time": 5.596,
"nrows": 3,
"nsuccess": 3,
"nfail": 0,
Expand Down
Binary file not shown.

0 comments on commit c72e98d

Please sign in to comment.