From 81493f7521b8238f1926be497c0d7816c2c77ae6 Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Fri, 25 Aug 2023 09:21:01 +0200 Subject: [PATCH 1/7] feat: added control of datetimes and included tests --- kindle2notion/exporting.py | 39 +++++++++++++++++++++++--------------- setup.py | 2 +- tests/test_exporting.py | 36 +++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/kindle2notion/exporting.py b/kindle2notion/exporting.py index 25bee93..19fc910 100644 --- a/kindle2notion/exporting.py +++ b/kindle2notion/exporting.py @@ -16,11 +16,11 @@ def export_to_notion( - all_books: Dict, - enable_highlight_date: bool, - enable_book_cover: bool, - notion_api_auth_token: str, - notion_database_id: str, + all_books: Dict, + enable_highlight_date: bool, + enable_book_cover: bool, + notion_api_auth_token: str, + notion_database_id: str, ) -> None: print("Initiating transfer...\n") @@ -48,7 +48,7 @@ def export_to_notion( def _prepare_aggregated_text_for_one_book( - clippings: List, enable_highlight_date: bool + clippings: List, enable_highlight_date: bool ) -> Tuple[str, str]: # TODO: Special case for books with len(clippings) >= 100 characters. 
Character limit in a Paragraph block in Notion is 100 formatted_clippings = [] @@ -77,17 +77,17 @@ def _prepare_aggregated_text_for_one_book( def _add_book_to_notion( - title: str, - author: str, - clippings_count: int, - formatted_clippings: list, - last_date: str, - notion_api_auth_token: str, - notion_database_id: str, - enable_book_cover: bool, + title: str, + author: str, + clippings_count: int, + formatted_clippings: list, + last_date_string: str, + notion_api_auth_token: str, + notion_database_id: str, + enable_book_cover: bool, ): notion = notional.connect(auth=notion_api_auth_token) - last_date = datetime.strptime(last_date, "%A, %d %B %Y %I:%M:%S %p") + last_date = get_last_date_from_string(last_date_string) # Condition variables title_exists = False @@ -174,6 +174,15 @@ def _add_book_to_notion( return message +def get_last_date_from_string(last_date_string: str) -> datetime: + if not last_date_string: + return datetime.now() + try: + return datetime.strptime(last_date_string, "%A, %d %B %Y %I:%M:%S %p") + except ValueError: + # Datetime format is not English, retrying with non AM-PM format + return datetime.strptime(last_date_string, "%A, %d %B %Y %I:%M:%S") + # def _create_rich_text_object(text): # if "Note: " in text: # # Bold text diff --git a/setup.py b/setup.py index 22a4545..d511524 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name="kindle2notion", - version="1.0.1", + version="1.0.2", author="Jeffrey Jacob", author_email="jeffreysamjacob@gmail.com", description="Export all the clippings from your Kindle device to a database in Notion.", diff --git a/tests/test_exporting.py b/tests/test_exporting.py index 6b4a995..6ecfd94 100644 --- a/tests/test_exporting.py +++ b/tests/test_exporting.py @@ -71,3 +71,39 @@ def test_prepare_aggregated_text_for_one_book_should_return_the_aggregated_text_ print(actual) # Then assert expected == actual + + +def test_when_date_is_not_ampm_format_then_aggregated_text_should_return_date(): + # Given + 
highlights = [ + ( + "This is an example highlight.", + "1", + "100", + "jueves, 24 de agosto de 2023 7:28:38", + False, + ), + ( + "This is a second example highlight.", + "2", + "200", + "viernes, 25 de agosto de 2023 7:28:38", + True, + ), + ] + + expected = ( + [ + "This is an example highlight.\n* Page: 1, Location: 100, Date Added: jueves, 24 de agosto de 2023 7:28:38\n\n", + "> NOTE: \nThis is a second example highlight.\n* Page: 2, Location: 200, Date Added: viernes, 25 de agosto de 2023 7:28:38\n\n", + ], + "viernes, 25 de agosto de 2023 7:28:38", + ) + + # When + actual = _prepare_aggregated_text_for_one_book( + highlights, enable_highlight_date=True + ) + print(actual) + # Then + assert expected == actual From 6d1904ee50aacf34bf01f96b1699fe26aab69b01 Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Fri, 25 Aug 2023 09:29:26 +0200 Subject: [PATCH 2/7] feat: added TODOs for future development --- kindle2notion/parsing.py | 4 +++- tests/test_exporting.py | 6 +++++- tests/test_parsing.py | 17 +++++++++++++++++ 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/kindle2notion/parsing.py b/kindle2notion/parsing.py index dea4435..a871810 100644 --- a/kindle2notion/parsing.py +++ b/kindle2notion/parsing.py @@ -126,7 +126,8 @@ def _parse_page_location_date_and_note( second_line_as_list = second_line.strip().split(" | ") page = location = date = "" is_note = False - + # TODO: Handle multi-language through: element in Pages.ENUM.values + # one enum for each of notes, location, pages, added on that contains language versions for element in second_line_as_list: element = element.lower() if "note" in element: @@ -141,6 +142,7 @@ def _parse_page_location_date_and_note( date = parse( element[element.find("added on") :].replace("added on", "").strip() ) + # TODO: Handle different format date = date.strftime("%A, %d %B %Y %I:%M:%S %p") return page, location, date, is_note diff --git a/tests/test_exporting.py b/tests/test_exporting.py index 6ecfd94..c190dc5 
100644 --- a/tests/test_exporting.py +++ b/tests/test_exporting.py @@ -73,7 +73,7 @@ def test_prepare_aggregated_text_for_one_book_should_return_the_aggregated_text_ assert expected == actual -def test_when_date_is_not_ampm_format_then_aggregated_text_should_return_date(): +def test_when_date_is_not_ampm_format_then_aggregated_text_should_return_appropiate_date(): # Given highlights = [ ( @@ -107,3 +107,7 @@ def test_when_date_is_not_ampm_format_then_aggregated_text_should_return_date(): print(actual) # Then assert expected == actual + + +def test_when_date_is_not_ampm_format_then_aggregated_text_should_not_give_valueerror(): + pass \ No newline at end of file diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 26e0be0..536d3ec 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -333,3 +333,20 @@ def test_add_parsed_items_to_books_dict_should_add_the_parsed_items_when_the_boo # Then assert expected == actual + +def test_parse_date_when_format_does_not_include_am_pm(): + # Given + raw_clipping_list = [ + "Relativity (Einstein, Albert)", + "- La subrayado en la posición 558-560 | Añadido el viernes, 25 de agosto de 2023 7:28:38", + "", + "This is a test highlight.", + False, + ] + expected = ("3", "", "Friday, 30 April 2021 12:31:29 AM", False) + + # When + actual = _parse_page_location_date_and_note(raw_clipping_list) + + # Then + assert expected == actual From ecda5805223fa9900a1b31b8f404762f78964282 Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Fri, 25 Aug 2023 18:02:28 +0200 Subject: [PATCH 3/7] feat: implemented enums to support languages --- kindle2notion/parsing.py | 39 +++++++++++++++++---------- kindle2notion/parsing/WordDetector.py | 24 +++++++++++++++++ kindle2notion/parsing/__init__.py | 0 kindle2notion/parsing/enums.py | 38 ++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 kindle2notion/parsing/WordDetector.py create mode 100644 kindle2notion/parsing/__init__.py create mode 
100644 kindle2notion/parsing/enums.py diff --git a/kindle2notion/parsing.py b/kindle2notion/parsing.py index a871810..68412af 100644 --- a/kindle2notion/parsing.py +++ b/kindle2notion/parsing.py @@ -1,8 +1,12 @@ +from datetime import datetime from re import findall from typing import Dict, List, Tuple from dateparser import parse +from kindle2notion.parsing.WordDetector import WordDetector +from kindle2notion.parsing.enums import NoteEnum, PageEnum, LocationEnum, DateAddedEnum, Locale, Word + BOOKS_WO_AUTHORS = [] ACADEMIC_TITLES = [ @@ -80,6 +84,8 @@ DELIMITERS = ["; ", " & ", " and "] +WORD_DETECTOR = WordDetector([language for language in Locale]) + def parse_raw_clippings_text(raw_clippings_text: str) -> Dict: raw_clippings_list = raw_clippings_text.split("==========") @@ -126,28 +132,33 @@ def _parse_page_location_date_and_note( second_line_as_list = second_line.strip().split(" | ") page = location = date = "" is_note = False - # TODO: Handle multi-language through: element in Pages.ENUM.values - # one enum for each of notes, location, pages, added on that contains language versions for element in second_line_as_list: element = element.lower() - if "note" in element: + language: Locale = WORD_DETECTOR.detect(element) + if Word.NOTE.value[language] in element: is_note = True - if "page" in element: - page = element[element.find("page") :].replace("page", "").strip() - if "location" in element: - location = ( - element[element.find("location") :].replace("location", "").strip() + if is_word_in_element(element, language, Word.PAGE): + page = _parse_word_from_element(element, language, Word.PAGE) + if is_word_in_element(element, language, Word.LOCATION): + location = _parse_word_from_element(element, language, Word.LOCATION) + if is_word_in_element(element, language, Word.DATE_ADDED): + date_parsed: datetime = parse( + _parse_word_from_element(element, language, Word.DATE_ADDED) ) - if "added on" in element: - date = parse( - element[element.find("added on") 
:].replace("added on", "").strip() - ) - # TODO: Handle different format - date = date.strftime("%A, %d %B %Y %I:%M:%S %p") + date = date_parsed.strftime(Word.DATE_FORMAT.value[language]) return page, location, date, is_note +def is_word_in_element(element: str, language: Locale, word: Word): + return word.value[language] in element + + +def _parse_word_from_element(element: str, language: Locale, word: Word): + word_value_in_language = word.value[language] + return element[element.find(word_value_in_language):].replace(word_value_in_language, "").strip() + + def _add_parsed_items_to_all_books_dict( all_books: Dict, title: str, diff --git a/kindle2notion/parsing/WordDetector.py b/kindle2notion/parsing/WordDetector.py new file mode 100644 index 0000000..c9414e8 --- /dev/null +++ b/kindle2notion/parsing/WordDetector.py @@ -0,0 +1,24 @@ +from typing import List + +from kindle2notion.parsing.enums import Word, Locale + + +class WordDetector: + + def __init__(self, languages: List[Locale]): + self.languages = languages + self.language_words = {lang: set() for lang in languages} + + for word in Word: + for lang in word.value: + self.language_words[lang].add(word.value[lang]) + + def detect(self, text): + text_words = set(text.split()) + scores = {lang: 0 for lang in self.languages} + + for lang, words in self.language_words.items(): + matches = text_words & words + scores[lang] += len(matches) + + return max(scores, key=scores.get) diff --git a/kindle2notion/parsing/__init__.py b/kindle2notion/parsing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/kindle2notion/parsing/enums.py b/kindle2notion/parsing/enums.py new file mode 100644 index 0000000..b6174f6 --- /dev/null +++ b/kindle2notion/parsing/enums.py @@ -0,0 +1,38 @@ +from enum import Enum + + +class Locale(Enum): + # Enum containing languages + ENGLISH = "en" + SPANISH = "es" + + def __str__(self): + return self.value + + +class Word(Enum): + # For each word, we have to handle different 
languages + NOTE = { + Locale.ENGLISH: "note", + Locale.SPANISH: "nota" + } + LOCATION = { + Locale.ENGLISH: "location", + Locale.SPANISH: "posición", + } + PAGE = { + Locale.ENGLISH: "page", + Locale.SPANISH: "página", + } + DATE_ADDED = { + Locale.ENGLISH: "added on", + Locale.SPANISH: "añadido", + } + # Date formats also depend on language + DATE_FORMAT = { + Locale.ENGLISH: "%A, %d %B %Y %I:%M:%S %p", + Locale.SPANISH: "%A, %d %B %Y %I:%M:%S", + } + + def __str__(self, language=Locale.ENGLISH): + return self.value[language] From a7dfb8dc6e0f4c6ef0e4c21fa40671882fa00ec3 Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Fri, 25 Aug 2023 18:05:56 +0200 Subject: [PATCH 4/7] fix: renamed filename --- kindle2notion/parsing.py | 2 +- kindle2notion/parsing/{WordDetector.py => word_detector.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename kindle2notion/parsing/{WordDetector.py => word_detector.py} (100%) diff --git a/kindle2notion/parsing.py b/kindle2notion/parsing.py index 68412af..29f0d40 100644 --- a/kindle2notion/parsing.py +++ b/kindle2notion/parsing.py @@ -4,7 +4,7 @@ from dateparser import parse -from kindle2notion.parsing.WordDetector import WordDetector +from kindle2notion.parsing.word_detector import WordDetector from kindle2notion.parsing.enums import NoteEnum, PageEnum, LocationEnum, DateAddedEnum, Locale, Word BOOKS_WO_AUTHORS = [] diff --git a/kindle2notion/parsing/WordDetector.py b/kindle2notion/parsing/word_detector.py similarity index 100% rename from kindle2notion/parsing/WordDetector.py rename to kindle2notion/parsing/word_detector.py From 037e4b1a4472c8b9f019665ecf78a9eeb75673f9 Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Fri, 25 Aug 2023 18:19:33 +0200 Subject: [PATCH 5/7] feat: main working --- kindle2notion/{parsing => languages}/__init__.py | 0 kindle2notion/{parsing => languages}/enums.py | 0 kindle2notion/{parsing => languages}/word_detector.py | 2 +- kindle2notion/parsing.py | 4 ++-- 4 files changed, 3 insertions(+), 3 
deletions(-) rename kindle2notion/{parsing => languages}/__init__.py (100%) rename kindle2notion/{parsing => languages}/enums.py (100%) rename kindle2notion/{parsing => languages}/word_detector.py (91%) diff --git a/kindle2notion/parsing/__init__.py b/kindle2notion/languages/__init__.py similarity index 100% rename from kindle2notion/parsing/__init__.py rename to kindle2notion/languages/__init__.py diff --git a/kindle2notion/parsing/enums.py b/kindle2notion/languages/enums.py similarity index 100% rename from kindle2notion/parsing/enums.py rename to kindle2notion/languages/enums.py diff --git a/kindle2notion/parsing/word_detector.py b/kindle2notion/languages/word_detector.py similarity index 91% rename from kindle2notion/parsing/word_detector.py rename to kindle2notion/languages/word_detector.py index c9414e8..42f8e51 100644 --- a/kindle2notion/parsing/word_detector.py +++ b/kindle2notion/languages/word_detector.py @@ -1,6 +1,6 @@ from typing import List -from kindle2notion.parsing.enums import Word, Locale +from kindle2notion.languages.enums import Word, Locale class WordDetector: diff --git a/kindle2notion/parsing.py b/kindle2notion/parsing.py index 29f0d40..35daac0 100644 --- a/kindle2notion/parsing.py +++ b/kindle2notion/parsing.py @@ -4,8 +4,8 @@ from dateparser import parse -from kindle2notion.parsing.word_detector import WordDetector -from kindle2notion.parsing.enums import NoteEnum, PageEnum, LocationEnum, DateAddedEnum, Locale, Word +from kindle2notion.languages.word_detector import WordDetector +from kindle2notion.languages.enums import Locale, Word BOOKS_WO_AUTHORS = [] From 005806dee4ada44ceda4fe446e24d30bf49c8788 Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Sat, 26 Aug 2023 09:18:55 +0200 Subject: [PATCH 6/7] fix: solved date parsing issues by improving language detection --- kindle2notion/exporting.py | 4 ++-- kindle2notion/languages/enums.py | 4 ++-- kindle2notion/languages/word_detector.py | 10 +++++----- kindle2notion/parsing.py | 3 ++- 4 
files changed, 11 insertions(+), 10 deletions(-) diff --git a/kindle2notion/exporting.py b/kindle2notion/exporting.py index 19fc910..b9c397c 100644 --- a/kindle2notion/exporting.py +++ b/kindle2notion/exporting.py @@ -87,7 +87,7 @@ def _add_book_to_notion( enable_book_cover: bool, ): notion = notional.connect(auth=notion_api_auth_token) - last_date = get_last_date_from_string(last_date_string) + last_date = __get_last_date_from_string(last_date_string) # Condition variables title_exists = False @@ -174,7 +174,7 @@ def _add_book_to_notion( return message -def get_last_date_from_string(last_date_string: str) -> datetime: +def __get_last_date_from_string(last_date_string: str) -> datetime: if not last_date_string: return datetime.now() try: diff --git a/kindle2notion/languages/enums.py b/kindle2notion/languages/enums.py index b6174f6..f99737c 100644 --- a/kindle2notion/languages/enums.py +++ b/kindle2notion/languages/enums.py @@ -26,12 +26,12 @@ class Word(Enum): } DATE_ADDED = { Locale.ENGLISH: "added on", - Locale.SPANISH: "añadido", + Locale.SPANISH: "añadido el", } # Date formats also depend on language DATE_FORMAT = { Locale.ENGLISH: "%A, %d %B %Y %I:%M:%S %p", - Locale.SPANISH: "%A, %d %B %Y %I:%M:%S", + Locale.SPANISH: "%A, %d %B %Y %H:%M:%S", } def __str__(self, language=Locale.ENGLISH): diff --git a/kindle2notion/languages/word_detector.py b/kindle2notion/languages/word_detector.py index 42f8e51..2b38126 100644 --- a/kindle2notion/languages/word_detector.py +++ b/kindle2notion/languages/word_detector.py @@ -14,11 +14,11 @@ def __init__(self, languages: List[Locale]): self.language_words[lang].add(word.value[lang]) def detect(self, text): - text_words = set(text.split()) scores = {lang: 0 for lang in self.languages} - for lang, words in self.language_words.items(): - matches = text_words & words - scores[lang] += len(matches) - + scores[lang] = sum([len(word) for word in words if self.has_word(text, word)]) return max(scores, key=scores.get) + + def 
has_word(self, text, word): + return word.lower() in text.lower() + diff --git a/kindle2notion/parsing.py b/kindle2notion/parsing.py index 35daac0..12a850d 100644 --- a/kindle2notion/parsing.py +++ b/kindle2notion/parsing.py @@ -142,8 +142,9 @@ def _parse_page_location_date_and_note( if is_word_in_element(element, language, Word.LOCATION): location = _parse_word_from_element(element, language, Word.LOCATION) if is_word_in_element(element, language, Word.DATE_ADDED): + date_string = _parse_word_from_element(element, language, Word.DATE_ADDED) date_parsed: datetime = parse( - _parse_word_from_element(element, language, Word.DATE_ADDED) + date_string, languages=[language.value for language in Locale] ) date = date_parsed.strftime(Word.DATE_FORMAT.value[language]) From d0de5e662e0c9570b808ba87736b11e471a8936b Mon Sep 17 00:00:00 2001 From: Victor Garcia Date: Sat, 26 Aug 2023 09:21:09 +0200 Subject: [PATCH 7/7] fix: missed change in fix datetime --- kindle2notion/exporting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kindle2notion/exporting.py b/kindle2notion/exporting.py index b9c397c..930af65 100644 --- a/kindle2notion/exporting.py +++ b/kindle2notion/exporting.py @@ -181,7 +181,7 @@ def __get_last_date_from_string(last_date_string: str) -> datetime: return datetime.strptime(last_date_string, "%A, %d %B %Y %I:%M:%S %p") except ValueError: # Datetime format is not English, retrying with non AM-PM format - return datetime.strptime(last_date_string, "%A, %d %B %Y %I:%M:%S") + return datetime.strptime(last_date_string, "%A, %d %B %Y %H:%M:%S") # def _create_rich_text_object(text): # if "Note: " in text: