From 5918f022613ffe73ef7908b9c307ff93bf848e14 Mon Sep 17 00:00:00 2001 From: Dmitri Date: Mon, 25 Nov 2024 13:42:35 -0500 Subject: [PATCH 1/6] SFR-2308_PubBacklistMapping --- mappings/UofM.py | 12 -- mappings/publisher_backlist.py | 105 ++++++++++++++++++ .../sources/publisher_backlist_service.py | 8 +- .../processes/test_pub_backlist_mapping.py | 51 +++++++++ 4 files changed, 160 insertions(+), 16 deletions(-) create mode 100644 mappings/publisher_backlist.py create mode 100644 tests/unit/processes/test_pub_backlist_mapping.py diff --git a/mappings/UofM.py b/mappings/UofM.py index 540adf160b..8d060742b1 100644 --- a/mappings/UofM.py +++ b/mappings/UofM.py @@ -83,12 +83,6 @@ def formatSubjects(self): return subjectList def formatRights(self): - ''' - The pipe delimiter is to separate the Rights table attributes into this format: - source|license|reason|statement|date - which makes it easy to place the right data into the columns when clustered - ''' - if not self.record.rights: return None @@ -102,9 +96,3 @@ def formatRights(self): return 'UofM|{}||{}|'.format('public_domain', 'Public Domain') return None - - - - - - diff --git a/mappings/publisher_backlist.py b/mappings/publisher_backlist.py new file mode 100644 index 0000000000..ac636fcaa5 --- /dev/null +++ b/mappings/publisher_backlist.py @@ -0,0 +1,105 @@ +from .json import JSONMapping + +class PublisherBacklistMapping(JSONMapping): + def __init__(self, source): + super().__init__(source, {}) + self.mapping = self.createMapping() + + def createMapping(self): + return { + 'title': ('Title', '{0}'), + 'authors': ('Author(s)', '{0}'), + 'dates': [('Pub Date', '{0}|publication_date')], + 'publisher': [('Publisher (from Projects)', '{0}||')], + 'identifiers': [ + ('ISBN', '{0}|isbn'), + ('OCLC', '{0}|oclc') + ], + 'rights': ('DRB Rights Classification', '{0}||||'), + 'contributors': [('Contributors', '{0}|||contributor')], + 'subjects': ('Subject 1', '{0}'), + 'source': ('Projects', '{0}'), + 'publisher_project_source': ('Publisher (from Projects)', '{0}') + } + + def apply_formatting(self): + self.record.has_part = [] + + if self.record.source: + source_list = self.record.source.split(' ') + print(source_list) + self.record.source = source_list[0] + + if self.record.publisher_project_source: + publisher_source = self.record.publisher_project_source[0] + self.record.publisher_project_source = publisher_source + + if self.record.authors: + self.record.authors = self.format_authors() + + if self.record.subjects: + self.record.subjects = self.format_subjects() + + if self.record.identifiers: + if len(self.record.identifiers) == 1: + source_id = self.record.identifiers[0].split('|')[0] + self.record.source_id = f'{self.record.source}_{source_id}' + self.record.identifiers = self.format_identifiers() + else: + source_id = self.record.identifiers[1].split('|')[0] + self.record.source_id = f'{self.record.source}_{source_id}' + self.record.identifiers = self.format_identifiers() + + self.record.rights = self.format_rights() + + def format_authors(self): + authorList = [] + + if ';' in self.record.authors: + authorList = self.record.authors.split('; ') + newAuthorList = [f'{author}|||true' for author in authorList] + return newAuthorList + else: + authorList.append(f'{self.record.authors}|||true)') + return authorList + + + def format_identifiers(self): + if 'isbn' in self.record.identifiers[0]: + isbnString = self.record.identifiers[0].split('|')[0] + if ';' in isbnString: + isbnList = isbnString.split('; ') + newISBNList = [f'{isbn}|isbn' for isbn in isbnList] + if len(self.record.identifiers) > 1 and 'oclc' in self.record.identifiers[1]: + newISBNList.append(f'{self.record.identifiers[1]}') + return newISBNList + else: + return newISBNList + + return self.record.identifiers + + def format_subjects(self): + subjectList = [] + + if '|' in self.record.subjects: + subjectList = self.record.subjects.split('|') + newSubjectList = [f'{subject}||' for subject in subjectList] + return newSubjectList + else: + subjectList.append(f'{self.record.subjects}||') + return subjectList + + def format_rights(self): + if not self.record.rights: + return None + + rightsElements = self.record.rights.split('|') + rightsStatus = rightsElements[0] + + if rightsStatus == 'in copyright': + return '{}|{}||{}|'.format('self.record.source', 'in_copyright', 'In Copyright') + + if rightsStatus == 'public domain': + return '{}|{}||{}|'.format('self.record.source', 'public_domain', 'Public Domain') + + return None diff --git a/services/sources/publisher_backlist_service.py b/services/sources/publisher_backlist_service.py index 74e5c82326..ea74e3971d 100644 --- a/services/sources/publisher_backlist_service.py +++ b/services/sources/publisher_backlist_service.py @@ -3,10 +3,10 @@ import requests import json import urllib.parse -from typing import Optional, Dict +from typing import Optional from logger import create_log -from mappings.UofM import UofMMapping +from mappings.publisher_backlist import PublisherBacklistMapping from .source_service import SourceService logger = create_log(__name__) @@ -23,14 +23,14 @@ def get_records( start_timestamp: datetime=None, offset: Optional[int]=None, limit: Optional[int]=None - ) -> list[UofMMapping]: + ) -> list[PublisherBacklistMapping]: array_json_records = self.get_records_json(full_import, start_timestamp, offset, limit) for json_dict in array_json_records: for records_value in json_dict['records']: try: record_metadata_dict = records_value - record = UofMMapping(record_metadata_dict) + record = PublisherBacklistMapping(record_metadata_dict) record.applyMapping() except Exception: logger.exception(f'Failed to process Publisher Backlist record') diff --git a/tests/unit/processes/test_pub_backlist_mapping.py b/tests/unit/processes/test_pub_backlist_mapping.py new file mode 100644 index 0000000000..d38853c0b2 --- /dev/null +++ b/tests/unit/processes/test_pub_backlist_mapping.py @@ -0,0 +1,51 @@ +import pytest + +from mappings.publisher_backlist import PublisherBacklistMapping + +class TestPublisherBacklistMapping: + @pytest.fixture + def testMapping(self): + class TestPublisherBacklistMapping(PublisherBacklistMapping): + def __init__(self): + self.mapping = None + + return TestPublisherBacklistMapping() + + @pytest.fixture + def testRecordStandard(self, mocker): + return mocker.MagicMock( + title='testTitle', + authors=['testAuthor|||true'], + dates=['testDate|publication_date'], + publisher=['testPublisher||'], + identifiers=['testISBN|isbn', 'testOCLC|oclc'], + rights='in copyright||||', + contributor=['testContributor|||contributor'], + subjects='testSubject', + source='UofM Press', + publisher_project_source=['University of Michigan'] + ) + + def test_createMapping(self, testMapping): + recordMapping = testMapping.createMapping() + + assert list(recordMapping.keys()) == [ + 'title', 'authors', 'dates', 'publisher', + 'identifiers', 'rights', 'contributors', 'subjects', + 'source', 'publisher_project_source' + ] + assert recordMapping['title'] == ('Title', '{0}') + + def test_apply_formatting_standard(self, testMapping, testRecordStandard): + testMapping.record = testRecordStandard + + testMapping.apply_formatting() + + assert testMapping.record.has_part == [] + assert testMapping.record.source == 'UofM' + assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc'] + assert testMapping.record.source_id == 'UofM_testOCLC' + assert testMapping.record.publisher == ['testPublisher||'] + assert testMapping.record.source == 'UofM' + assert testMapping.record.publisher_project_source == 'University of Michigan' + assert testMapping.record.subjects == ['testSubject||'] From efef4d6f06fda7b1982b07ee5ddbdcc995126d70 Mon Sep 17 00:00:00 2001 From: Dmitri Date: Mon, 25 Nov 2024 14:07:28 -0500 Subject: [PATCH 2/6] Fixed method name --- mappings/publisher_backlist.py | 4 +--- processes/ingest/publisher_backlist.py | 2 +- services/sources/publisher_backlist_service.py | 2 +- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/mappings/publisher_backlist.py b/mappings/publisher_backlist.py index ac636fcaa5..dea2e4e6e8 100644 --- a/mappings/publisher_backlist.py +++ b/mappings/publisher_backlist.py @@ -22,12 +22,10 @@ def createMapping(self): 'publisher_project_source': ('Publisher (from Projects)', '{0}') } - def apply_formatting(self): + def applyFormatting(self): self.record.has_part = [] - if self.record.source: source_list = self.record.source.split(' ') - print(source_list) self.record.source = source_list[0] if self.record.publisher_project_source: diff --git a/processes/ingest/publisher_backlist.py b/processes/ingest/publisher_backlist.py index a2ff9656b1..7954af1bc0 100644 --- a/processes/ingest/publisher_backlist.py +++ b/processes/ingest/publisher_backlist.py @@ -33,7 +33,7 @@ def runProcess(self): else: logger.warning(f'Unknown Publisher Backlist ingestion process type {self.process}') return - + raise Exception for record in records: self.addDCDWToUpdateList(record) diff --git a/services/sources/publisher_backlist_service.py b/services/sources/publisher_backlist_service.py index ea74e3971d..5ba347c823 100644 --- a/services/sources/publisher_backlist_service.py +++ b/services/sources/publisher_backlist_service.py @@ -29,7 +29,7 @@ def get_records( for json_dict in array_json_records: for records_value in json_dict['records']: try: - record_metadata_dict = records_value + record_metadata_dict = records_value['fields'] record = PublisherBacklistMapping(record_metadata_dict) record.applyMapping() except Exception: From 577a45fec3cc58c5d05e28e5f12c255386b29c39 Mon Sep 17 00:00:00 2001 From: Dmitri Date: Mon, 25 Nov 2024 15:04:59 -0500 Subject: [PATCH 3/6] Fixed failed test --- tests/unit/{processes => }/test_pub_backlist_mapping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename tests/unit/{processes => }/test_pub_backlist_mapping.py (94%) diff --git a/tests/unit/processes/test_pub_backlist_mapping.py b/tests/unit/test_pub_backlist_mapping.py similarity index 94% rename from tests/unit/processes/test_pub_backlist_mapping.py rename to tests/unit/test_pub_backlist_mapping.py index d38853c0b2..c3aaf2e791 100644 --- a/tests/unit/processes/test_pub_backlist_mapping.py +++ b/tests/unit/test_pub_backlist_mapping.py @@ -36,10 +36,10 @@ def test_createMapping(self, testMapping): ] assert recordMapping['title'] == ('Title', '{0}') - def test_apply_formatting_standard(self, testMapping, testRecordStandard): + def test_applyFormatting_standard(self, testMapping, testRecordStandard): testMapping.record = testRecordStandard - testMapping.apply_formatting() + testMapping.applyFormatting() assert testMapping.record.has_part == [] assert testMapping.record.source == 'UofM' From 8987fa9c604f8b6bb65f4052cf18f5fad6b4016a Mon Sep 17 00:00:00 2001 From: Dmitri Date: Mon, 25 Nov 2024 15:10:29 -0500 Subject: [PATCH 4/6] Fixed failed test --- processes/ingest/publisher_backlist.py | 1 - services/sources/publisher_backlist_service.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/processes/ingest/publisher_backlist.py b/processes/ingest/publisher_backlist.py index 7954af1bc0..199fc6e1c2 100644 --- a/processes/ingest/publisher_backlist.py +++ b/processes/ingest/publisher_backlist.py @@ -33,7 +33,6 @@ def runProcess(self): else: logger.warning(f'Unknown Publisher Backlist ingestion process type {self.process}') return - raise Exception for record in records: self.addDCDWToUpdateList(record) diff --git a/services/sources/publisher_backlist_service.py b/services/sources/publisher_backlist_service.py index 5ba347c823..523bb5e343 100644 --- a/services/sources/publisher_backlist_service.py +++ b/services/sources/publisher_backlist_service.py @@ -96,4 +96,5 @@ def get_records_array(self, pub_backlist_records_response_json = pub_backlist_records_response.json() array_json.append(pub_backlist_records_response_json) + print(len(array_json)) return array_json From 0c10a8b44ee0d8376637bd4faf64be05143a0246 Mon Sep 17 00:00:00 2001 From: Dmitri Date: Tue, 26 Nov 2024 12:47:03 -0500 Subject: [PATCH 5/6] Added source_id to createMapping and retrieved source name from Airtable --- mappings/publisher_backlist.py | 60 +++++++++---------- .../sources/publisher_backlist_service.py | 2 - tests/unit/test_pub_backlist_mapping.py | 14 ++--- 3 files changed, 34 insertions(+), 42 deletions(-) diff --git a/mappings/publisher_backlist.py b/mappings/publisher_backlist.py index dea2e4e6e8..b3507bd6cd 100644 --- a/mappings/publisher_backlist.py +++ b/mappings/publisher_backlist.py @@ -18,14 +18,15 @@ def createMapping(self): 'rights': ('DRB Rights Classification', '{0}||||'), 'contributors': [('Contributors', '{0}|||contributor')], 'subjects': ('Subject 1', '{0}'), - 'source': ('Projects', '{0}'), + 'source': ('Project Name (from Projects)', '{0}'), + 'source_id': ('DRB Record_ID', '{0}'), 'publisher_project_source': ('Publisher (from Projects)', '{0}') } def applyFormatting(self): self.record.has_part = [] if self.record.source: - source_list = self.record.source.split(' ') + source_list = self.record.source[0].split(' ') self.record.source = source_list[0] if self.record.publisher_project_source: @@ -39,65 +40,58 @@ def applyFormatting(self): self.record.subjects = self.format_subjects() if self.record.identifiers: - if len(self.record.identifiers) == 1: - source_id = self.record.identifiers[0].split('|')[0] - self.record.source_id = f'{self.record.source}_{source_id}' - self.record.identifiers = self.format_identifiers() - else: - source_id = self.record.identifiers[1].split('|')[0] - self.record.source_id = f'{self.record.source}_{source_id}' - self.record.identifiers = self.format_identifiers() + self.record.identifiers = self.format_identifiers() self.record.rights = self.format_rights() def format_authors(self): - authorList = [] + author_list = [] if ';' in self.record.authors: - authorList = self.record.authors.split('; ') - newAuthorList = [f'{author}|||true' for author in authorList] - return newAuthorList + author_list = self.record.authors.split('; ') + new_author_list = [f'{author}|||true' for author in author_list] + return new_author_list else: - authorList.append(f'{self.record.authors}|||true)') - return authorList + author_list.append(f'{self.record.authors}|||true)') + return author_list def format_identifiers(self): if 'isbn' in self.record.identifiers[0]: - isbnString = self.record.identifiers[0].split('|')[0] - if ';' in isbnString: - isbnList = isbnString.split('; ') - newISBNList = [f'{isbn}|isbn' for isbn in isbnList] + isbn_string = self.record.identifiers[0].split('|')[0] + if ';' in isbn_string: + isbnList = isbn_string.split('; ') + new_isbn_list = [f'{isbn}|isbn' for isbn in isbnList] if len(self.record.identifiers) > 1 and 'oclc' in self.record.identifiers[1]: - newISBNList.append(f'{self.record.identifiers[1]}') - return newISBNList + new_isbn_list.append(f'{self.record.identifiers[1]}') + return new_isbn_list else: - return newISBNList + return new_isbn_list return self.record.identifiers def format_subjects(self): - subjectList = [] + subject_list = [] if '|' in self.record.subjects: - subjectList = self.record.subjects.split('|') - newSubjectList = [f'{subject}||' for subject in subjectList] - return newSubjectList + subject_list = self.record.subjects.split('|') + new_subject_list = [f'{subject}||' for subject in subject_list] + return new_subject_list else: - subjectList.append(f'{self.record.subjects}||') - return subjectList + subject_list.append(f'{self.record.subjects}||') + return subject_list def format_rights(self): if not self.record.rights: return None - rightsElements = self.record.rights.split('|') - rightsStatus = rightsElements[0] + rights_elements = self.record.rights.split('|') + rights_status = rights_elements[0] - if rightsStatus == 'in copyright': + if rights_status == 'in copyright': return '{}|{}||{}|'.format('self.record.source', 'in_copyright', 'In Copyright') - if rightsStatus == 'public domain': + if rights_status == 'public domain': return '{}|{}||{}|'.format('self.record.source', 'public_domain', 'Public Domain') return None diff --git a/services/sources/publisher_backlist_service.py b/services/sources/publisher_backlist_service.py index 523bb5e343..2c89dfc74b 100644 --- a/services/sources/publisher_backlist_service.py +++ b/services/sources/publisher_backlist_service.py @@ -89,12 +89,10 @@ def get_records_array(self, pub_backlist_records_response = requests.get(url, headers=headers) pub_backlist_records_response_json = pub_backlist_records_response.json() array_json = [pub_backlist_records_response_json] - while 'offset' in pub_backlist_records_response_json: next_page_url = url + f"&offset={pub_backlist_records_response_json['offset']}" pub_backlist_records_response = requests.get(next_page_url, headers=headers) pub_backlist_records_response_json = pub_backlist_records_response.json() array_json.append(pub_backlist_records_response_json) - print(len(array_json)) return array_json diff --git a/tests/unit/test_pub_backlist_mapping.py b/tests/unit/test_pub_backlist_mapping.py index c3aaf2e791..30325f007d 100644 --- a/tests/unit/test_pub_backlist_mapping.py +++ b/tests/unit/test_pub_backlist_mapping.py @@ -22,8 +22,9 @@ def testRecordStandard(self, mocker): rights='in copyright||||', contributor=['testContributor|||contributor'], subjects='testSubject', - source='UofM Press', - publisher_project_source=['University of Michigan'] + source=['UofMichigan Backlist'], + source_id='testSourceID', + publisher_project_source=['University of Michigan Press'] ) def test_createMapping(self, testMapping): @@ -32,7 +33,7 @@ def test_createMapping(self, testMapping): assert list(recordMapping.keys()) == [ 'title', 'authors', 'dates', 'publisher', 'identifiers', 'rights', 'contributors', 'subjects', - 'source', 'publisher_project_source' + 'source', 'source_id', 'publisher_project_source' ] assert recordMapping['title'] == ('Title', '{0}') @@ -42,10 +43,9 @@ def test_applyFormatting_standard(self, testMapping, testRecordStandard): testMapping.applyFormatting() assert testMapping.record.has_part == [] - assert testMapping.record.source == 'UofM' + assert testMapping.record.source == 'UofMichigan' assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc'] - assert testMapping.record.source_id == 'UofM_testOCLC' + assert testMapping.record.source_id == 'testSourceID' assert testMapping.record.publisher == ['testPublisher||'] - assert testMapping.record.source == 'UofM' - assert testMapping.record.publisher_project_source == 'University of Michigan' + assert testMapping.record.publisher_project_source == 'University of Michigan Press' assert testMapping.record.subjects == ['testSubject||'] From 49c14ff87d2adf58ebb6bf0ba60c6e43d552fcc4 Mon Sep 17 00:00:00 2001 From: Dmitri Date: Tue, 26 Nov 2024 13:10:09 -0500 Subject: [PATCH 6/6] Changed some variables to snake case --- mappings/publisher_backlist.py | 13 +++++------ tests/unit/test_pub_backlist_mapping.py | 30 ++++++++++++------------- 2 files changed, 21 insertions(+), 22 deletions(-) diff --git a/mappings/publisher_backlist.py b/mappings/publisher_backlist.py index b3507bd6cd..371a78bb82 100644 --- a/mappings/publisher_backlist.py +++ b/mappings/publisher_backlist.py @@ -60,13 +60,13 @@ def format_identifiers(self): if 'isbn' in self.record.identifiers[0]: isbn_string = self.record.identifiers[0].split('|')[0] if ';' in isbn_string: - isbnList = isbn_string.split('; ') - new_isbn_list = [f'{isbn}|isbn' for isbn in isbnList] + isbns = isbn_string.split('; ') + formatted_isbns = [f'{isbn}|isbn' for isbn in isbns] if len(self.record.identifiers) > 1 and 'oclc' in self.record.identifiers[1]: - new_isbn_list.append(f'{self.record.identifiers[1]}') - return new_isbn_list + formatted_isbns.append(f'{self.record.identifiers[1]}') + return formatted_isbns else: - return new_isbn_list + return formatted_isbns return self.record.identifiers @@ -75,8 +75,7 @@ def format_subjects(self): if '|' in self.record.subjects: subject_list = self.record.subjects.split('|') - new_subject_list = [f'{subject}||' for subject in subject_list] - return new_subject_list + return [f'{subject}||' for subject in subject_list] else: subject_list.append(f'{self.record.subjects}||') return subject_list diff --git a/tests/unit/test_pub_backlist_mapping.py b/tests/unit/test_pub_backlist_mapping.py index 30325f007d..3f81ec75b7 100644 --- a/tests/unit/test_pub_backlist_mapping.py +++ b/tests/unit/test_pub_backlist_mapping.py @@ -4,7 +4,7 @@ class TestPublisherBacklistMapping: @pytest.fixture - def testMapping(self): + def test_mapping(self): class TestPublisherBacklistMapping(PublisherBacklistMapping): def __init__(self): self.mapping = None @@ -27,25 +27,25 @@ def testRecordStandard(self, mocker): publisher_project_source=['University of Michigan Press'] ) - def test_createMapping(self, testMapping): - recordMapping = testMapping.createMapping() + def test_createMapping(self, test_mapping): + record_mapping = test_mapping.createMapping() - assert list(recordMapping.keys()) == [ + assert list(record_mapping.keys()) == [ 'title', 'authors', 'dates', 'publisher', 'identifiers', 'rights', 'contributors', 'subjects', 'source', 'source_id', 'publisher_project_source' ] - assert recordMapping['title'] == ('Title', '{0}') + assert record_mapping['title'] == ('Title', '{0}') - def test_applyFormatting_standard(self, testMapping, testRecordStandard): - testMapping.record = testRecordStandard + def test_applyFormatting_standard(self, test_mapping, testRecordStandard): + test_mapping.record = testRecordStandard - testMapping.applyFormatting() + test_mapping.applyFormatting() - assert testMapping.record.has_part == [] - assert testMapping.record.source == 'UofMichigan' - assert testMapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc'] - assert testMapping.record.source_id == 'testSourceID' - assert testMapping.record.publisher == ['testPublisher||'] - assert testMapping.record.publisher_project_source == 'University of Michigan Press' - assert testMapping.record.subjects == ['testSubject||'] + assert test_mapping.record.has_part == [] + assert test_mapping.record.source == 'UofMichigan' + assert test_mapping.record.identifiers == ['testISBN|isbn', 'testOCLC|oclc'] + assert test_mapping.record.source_id == 'testSourceID' + assert test_mapping.record.publisher == ['testPublisher||'] + assert test_mapping.record.publisher_project_source == 'University of Michigan Press' + assert test_mapping.record.subjects == ['testSubject||']