Skip to content

Commit

Permalink
release v2.1.6
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisschellekens committed Nov 6, 2022
1 parent 9ac59b6 commit 397f046
Show file tree
Hide file tree
Showing 374 changed files with 359 additions and 94 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
[![Corpus Coverage : 100.0%](https://img.shields.io/badge/corpus%20coverage-100.0%25-green)]()
[![Text Extraction : 93.1%](https://img.shields.io/badge/text%20extraction-93.1%25-green)]()
[![Public Method Documentation : 100%](https://img.shields.io/badge/public%20method%20documentation-100%25-green)]()
[![Number of Tests : 414](https://img.shields.io/badge/number%20of%20tests-414-green)]()
[![Number of Tests : 417](https://img.shields.io/badge/number%20of%20tests-417-green)]()
[![Python : 3.8 | 3.9 | 3.10 ](https://img.shields.io/badge/python-3.8%20|%203.9%20|%203.10-green)]()

[![Downloads](https://pepy.tech/badge/borb)](https://pepy.tech/project/borb)
Expand Down
36 changes: 21 additions & 15 deletions borb/license/anonymous_user_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class AnonymousUserID:
"""

USER_ID_FILE_NAME: str = "anonymous_user_id"
USER_ID: typing.Optional[str] = None

@staticmethod
def _get_borb_installation_dir() -> typing.Optional[Path]:
Expand Down Expand Up @@ -54,12 +55,13 @@ def disable() -> None:
AnonymousUserID._get_borb_installation_dir() is not None
and AnonymousUserID._get_borb_installation_dir().exists()
):
with open(
AnonymousUserID._get_borb_installation_dir()
/ AnonymousUserID.USER_ID_FILE_NAME,
"w",
) as fh:
fh.write("")
try:
# fmt: off
with open(AnonymousUserID._get_borb_installation_dir() / AnonymousUserID.USER_ID_FILE_NAME, "w") as fh:
fh.write("")
# fmt: on
except:
pass

@staticmethod
def enable() -> None:
Expand All @@ -73,7 +75,10 @@ def enable() -> None:
AnonymousUserID._get_user_id_file_from_borb_dir() is not None
and AnonymousUserID._get_user_id_file_from_borb_dir().exists()
):
AnonymousUserID._get_user_id_file_from_borb_dir().unlink()
try:
AnonymousUserID._get_user_id_file_from_borb_dir().unlink()
except:
pass
AnonymousUserID.get()

@staticmethod
Expand All @@ -93,14 +98,15 @@ def get() -> typing.Optional[str]:
or not AnonymousUserID._get_user_id_file_from_borb_dir().exists()
)
):
uuid: str = UUID.get()
with open(
AnonymousUserID._get_borb_installation_dir()
/ AnonymousUserID.USER_ID_FILE_NAME,
"w",
) as fh:
fh.write(uuid)
return uuid
try:
# fmt: off
uuid: str = UUID.get()
with open(AnonymousUserID._get_borb_installation_dir() / AnonymousUserID.USER_ID_FILE_NAME, "w") as fh:
fh.write(uuid)
return uuid
# fmt: on
except:
pass

# IF the borb installation directory exists, and the user_id file exists
# THEN read the user_id file, and return its content
Expand Down
104 changes: 86 additions & 18 deletions borb/license/geo_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,31 @@ class GeoInformation:
of this system.
"""

_is_retrieving: bool = False
_country_name: typing.Optional[str] = None
_country_code: typing.Optional[str] = None
_city: typing.Optional[str] = None
_country_code: typing.Optional[str] = None
_country_name: typing.Optional[str] = None
_is_retrieving: bool = False
_latitude: typing.Optional[float] = None
_longitude: typing.Optional[float] = None
_state: typing.Optional[str] = None

@staticmethod
def get_country_name() -> typing.Optional[str]:
def get_city() -> typing.Optional[str]:
"""
This function returns the country name (e.g. Belgium)
associated with the IP of this system. If the country name still needs to be determined, None is returned.
:return: the country name
This function returns the city (e.g. Ghent)
associated with the IP of this system. If the city still needs to be determined, None is returned.
:return: the city
"""
if GeoInformation._country_name is None:
if (
GeoInformation._city is None
and GeoInformation._country_code is None
and GeoInformation._country_name is None
and GeoInformation._latitude is None
and GeoInformation._longitude is None
and GeoInformation._state is None
):
threading.Thread(target=GeoInformation._get).start()
return GeoInformation._country_name
return GeoInformation._city

@staticmethod
def get_country_code() -> typing.Optional[str]:
Expand All @@ -40,20 +49,70 @@ def get_country_code() -> typing.Optional[str]:
associated with the IP of this system. If the country code still needs to be determined, None is returned.
:return: the country code
"""
if GeoInformation._country_code is None:
if (
GeoInformation._city is None
and GeoInformation._country_code is None
and GeoInformation._country_name is None
and GeoInformation._latitude is None
and GeoInformation._longitude is None
and GeoInformation._state is None
):
threading.Thread(target=GeoInformation._get).start()
return GeoInformation._country_code

@staticmethod
def get_city() -> typing.Optional[str]:
def get_country_name() -> typing.Optional[str]:
"""
This function returns the city (e.g. Ghent)
associated with the IP of this system. If the city still needs to be determined, None is returned.
:return: the city
This function returns the country name (e.g. Belgium)
associated with the IP of this system. If the country name still needs to be determined, None is returned.
:return: the country name
"""
if GeoInformation._city is None:
if (
GeoInformation._city is None
and GeoInformation._country_code is None
and GeoInformation._country_name is None
and GeoInformation._latitude is None
and GeoInformation._longitude is None
and GeoInformation._state is None
):
threading.Thread(target=GeoInformation._get).start()
return GeoInformation._city
return GeoInformation._country_name

@staticmethod
def get_latitude() -> typing.Optional[float]:
    """
    Return the latitude (e.g. 51.05) associated with the IP of this system.
    While every cached geo field is still None, a thread running
    GeoInformation._get is started; the currently cached value is returned either way.
    :return: the latitude, or None if it has not been determined yet
    """
    cached_fields = (
        GeoInformation._city,
        GeoInformation._country_code,
        GeoInformation._country_name,
        GeoInformation._latitude,
        GeoInformation._longitude,
        GeoInformation._state,
    )
    # only trigger a lookup when nothing at all has been retrieved so far
    if all(field is None for field in cached_fields):
        threading.Thread(target=GeoInformation._get).start()
    return GeoInformation._latitude

@staticmethod
def get_longitude() -> typing.Optional[float]:
    """
    Return the longitude (e.g. 3.71667) associated with the IP of this system.
    While every cached geo field is still None, a thread running
    GeoInformation._get is started; the currently cached value is returned either way.
    :return: the longitude, or None if it has not been determined yet
    """
    nothing_cached: bool = all(
        value is None
        for value in (
            GeoInformation._city,
            GeoInformation._country_code,
            GeoInformation._country_name,
            GeoInformation._latitude,
            GeoInformation._longitude,
            GeoInformation._state,
        )
    )
    # only trigger a lookup when nothing at all has been retrieved so far
    if nothing_cached:
        threading.Thread(target=GeoInformation._get).start()
    return GeoInformation._longitude

@staticmethod
def get_state() -> typing.Optional[str]:
Expand All @@ -62,7 +121,14 @@ def get_state() -> typing.Optional[str]:
associated with the IP of this system. If the state still needs to be determined, None is returned.
:return: the state
"""
if GeoInformation._state is None:
if (
GeoInformation._city is None
and GeoInformation._country_code is None
and GeoInformation._country_name is None
and GeoInformation._latitude is None
and GeoInformation._longitude is None
and GeoInformation._state is None
):
threading.Thread(target=GeoInformation._get).start()
return GeoInformation._state

Expand Down Expand Up @@ -100,9 +166,11 @@ def _get():
GeoInformation._is_retrieving = False
return

GeoInformation._city = location_data.get("city", None)
GeoInformation._country_code = location_data.get("country_code", None)
GeoInformation._country_name = location_data.get("country_name", None)
GeoInformation._city = location_data.get("city", None)
GeoInformation._latitude = location_data.get("latitude", None)
GeoInformation._longitude = location_data.get("longitude", None)
GeoInformation._state = location_data.get("state", None)

# return
Expand Down
2 changes: 2 additions & 0 deletions borb/license/usage_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ def _send_usage_statistics_for_event(event: str) -> None:
"country_code": GeoInformation.get_country_code(),
"country_name": GeoInformation.get_country_name(),
"event": event,
"latitude": GeoInformation.get_latitude(),
"longitude": GeoInformation.get_longitude(),
"state": GeoInformation.get_state(),
"sys_platform": sys.platform,
"utc_time_in_ms": int(datetime.now(timezone.utc).timestamp() * 1000),
Expand Down
2 changes: 1 addition & 1 deletion borb/license/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def get_version() -> str:
This function returns the current borb version
:return: the current borb version
"""
return "2.1.5"
return "2.1.6"

@staticmethod
def get_author() -> str:
Expand Down
1 change: 1 addition & 0 deletions borb/pdf/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
from .canvas.layout.page_layout.block_flow import BlockFlow

# Shape
from .canvas.line_art.line_art_factory import LineArtFactory
from .canvas.layout.shape.connected_shape import ConnectedShape
from .canvas.layout.shape.disconnected_shape import DisconnectedShape
from .canvas.layout.smart_art.smart_art import SmartArt
Expand Down
13 changes: 9 additions & 4 deletions borb/pdf/canvas/layout/text/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,16 @@ def _split_text(self, bounding_box: Rectangle) -> typing.List[str]:
lines_of_text.append(w)
continue

# break the text according to the hyphenation
# fmt: off
if len(lines_of_text[-1]) > 0 and not self._respect_spaces_in_text:
lines_of_text[-1] += " "
lines_of_text[-1] += "".join([x for x in hyphenated_word_parts[0:hyphenation_split_index]]) + "-"
# break the text according to the hyphenation
# IF there is a previous line of text, we can append it to that line
if len(lines_of_text) > 0:
if len(lines_of_text[-1]) > 0 and not self._respect_spaces_in_text:
lines_of_text[-1] += " "
lines_of_text[-1] += "".join([x for x in hyphenated_word_parts[0:hyphenation_split_index]]) + "-"
# ELSE the hyphenated word is added (in parts) to lines_of_text
else:
lines_of_text.append("".join([x for x in hyphenated_word_parts[0:hyphenation_split_index]]) + "-")
lines_of_text.append("".join([x for x in hyphenated_word_parts[hyphenation_split_index:]]))
# fmt: on

Expand Down
12 changes: 8 additions & 4 deletions borb/pdf/page/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def has_acroforms(self) -> bool:
[
x
for x in self.get("Annots", [])
if "Type" in x
if x is not None
and "Type" in x
and x["Type"] == "Annot"
and "Subtype" in x
and x["Subtype"] == "Widget"
Expand All @@ -85,7 +86,8 @@ def has_form_field(self, field_name: str) -> bool:
[
x
for x in self.get("Annots", [])
if "Type" in x
if x is not None
and "Type" in x
and x["Type"] == "Annot"
and "Subtype" in x
and x["Subtype"] == "Widget"
Expand All @@ -109,7 +111,8 @@ def get_form_field_value(
field_dictionaries: typing.List[Dictionary] = [
x
for x in self.get("Annots", [])
if "Type" in x
if x is not None
and "Type" in x
and x["Type"] == "Annot"
and "Subtype" in x
and x["Subtype"] == "Widget"
Expand All @@ -133,7 +136,8 @@ def set_form_field_value(self, field_name: str, value: str) -> "Page":
field_dictionaries: typing.List[Dictionary] = [
x
for x in self.get("Annots", [])
if "Type" in x
if x is not None
and "Type" in x
and x["Type"] == "Annot"
and "Subtype" in x
and x["Subtype"] == "Widget"
Expand Down
7 changes: 6 additions & 1 deletion borb/toolkit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,10 +86,14 @@
# text (structure)
from .text.simple_paragraph_extraction import SimpleParagraphExtraction
from .text.simple_line_of_text_extraction import SimpleLineOfTextExtraction
from .text.regular_expression_text_extraction import RegularExpressionTextExtraction
from .text.regular_expression_text_extraction import (
RegularExpressionTextExtraction,
PDFMatch,
)

# text (filter)
from .text.font_color_filter import FontColorFilter
from .text.font_extraction import FontExtraction
from .text.font_name_filter import FontNameFilter

# text (keywords, NLP)
Expand All @@ -98,3 +102,4 @@
except:
pass
from .text.tf_idf_keyword_extraction import TFIDFKeywordExtraction
from .text.stop_words import FRENCH_STOP_WORDS, ENGLISH_STOP_WORDS
34 changes: 12 additions & 22 deletions borb/toolkit/text/text_rank_keyword_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,6 @@
from borb.toolkit.text.simple_text_extraction import SimpleTextExtraction
from borb.toolkit.text.stop_words import ENGLISH_STOP_WORDS

try:
from textblob import TextBlob # type: ignore [import]
except:
assert (
"TextBlob needs to be installed for TextRankKeywordExtraction to work properly."
)


class TextRankKeywordExtraction(SimpleTextExtraction):
"""
Expand Down Expand Up @@ -66,21 +59,20 @@ def _end_page(self, page: Page):
lines = [x for x in re.split("\n*[.?!]+\n*", txt) if len(x) != 0]
for line in lines:

# POS tagging
tags_and_tokens: typing.List[typing.Tuple[str, str]] = TextBlob(line).tags

# select only NOUN, ADJ
toks = [x[0] for x in tags_and_tokens if x[1] in ["NN", "JJ"]]
# split
tokens: typing.List[str] = [
x for x in re.split("[^A-Z]+", line.upper()) if len(x) > 3
]

# build transfer matrix
for i0 in range(0, len(toks)):
w0: str = toks[i0].upper()
for i0 in range(0, len(tokens)):
w0: str = tokens[i0].upper()
if w0 not in mtx:
mtx[w0] = {}
for i1 in range(-3, 3):
if i0 + i1 < 0 or i0 + i1 >= len(toks) or i1 == 0:
if i0 + i1 < 0 or i0 + i1 >= len(tokens) or i1 == 0:
continue
w1: str = toks[i0 + i1].upper()
w1: str = tokens[i0 + i1].upper()
mtx[w0][w1] = mtx[w0].get(w1, 0) + 1

# run eigenvalue algorithm
Expand All @@ -107,12 +99,10 @@ def _end_page(self, page: Page):
eigenvalues_002 = {x: 0 for x, _ in mtx.items()}

# store keywords
self._keywords_per_page[self._current_page] = [
(x, f) for x, f in eigenvalues_001.items()
]
self._keywords_per_page[self._current_page].sort(
key=lambda x: x[1], reverse=True
)
# fmt: off
self._keywords_per_page[self._current_page] = [(x, f) for x, f in eigenvalues_001.items()]
self._keywords_per_page[self._current_page].sort(key=lambda x: x[1], reverse=True)
# fmt: on

def get_keywords_for_page(self, page_number: int) -> typing.List[typing.Any]:
"""
Expand Down
Loading

0 comments on commit 397f046

Please sign in to comment.