From 2c2fdaaa87b5c7ae56f0cc3ccc7a446fa04b1a2a Mon Sep 17 00:00:00 2001 From: Benjamin Webb Date: Wed, 26 Jul 2023 15:54:31 -0400 Subject: [PATCH] Make FileSystemHandler --- .github/workflows/test.yml | 4 +- sitemap_generator/handler/__init__.py | 51 +++++++ .../{handler.py => handler/base.py} | 118 ++++----------- sitemap_generator/handler/filesystem.py | 141 ++++++++++++++++++ tests/test_handler.py | 17 ++- tests/test_util.py | 2 +- 6 files changed, 237 insertions(+), 96 deletions(-) create mode 100644 sitemap_generator/handler/__init__.py rename sitemap_generator/{handler.py => handler/base.py} (62%) create mode 100644 sitemap_generator/handler/filesystem.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 58e5a41..93c5ca2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -13,9 +13,11 @@ jobs: steps: - uses: actions/checkout@v3 - - name: Checkout Geoconnex Namespace + - name: Checkout Cached Geoconnex Namespace run: | git clone -b master https://github.com/internetofwater/geoconnex.us.git geoconnex.us + cd geoconnex.us + git checkout e0e1c2ba0d023bfd80f3e6d76c85c01fab35c581 - uses: actions/setup-python@v2 name: Setup Python ${{ matrix.python-version }} with: diff --git a/sitemap_generator/handler/__init__.py b/sitemap_generator/handler/__init__.py new file mode 100644 index 0000000..86ae2d8 --- /dev/null +++ b/sitemap_generator/handler/__init__.py @@ -0,0 +1,51 @@ +# ================================================================= +# +# Authors: Benjamin Webb +# +# Copyright (c) 2023 Benjamin Webb +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, 
subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# ================================================================= + +'''Handler classes''' + +import click + +from sitemap_generator.handler.filesystem import FileSystemHandler +from sitemap_generator.util import OPTION_VERBOSITY + + +@click.command() +@click.pass_context +@OPTION_VERBOSITY +@click.argument('filepath', type=click.Path(exists=True, file_okay=False)) +@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/', + help='uri stem to be removed from short url for keyword') +def run(ctx, verbosity, filepath, uri_stem): + # click has already verified filepath is an existing directory + handler = FileSystemHandler(filepath, uri_stem) + handler.handle() + + +if __name__ == '__main__': + run() diff --git a/sitemap_generator/handler.py b/sitemap_generator/handler/base.py similarity index 62% rename from sitemap_generator/handler.py rename to sitemap_generator/handler/base.py index 1affb27..8a49b24 100644 --- a/sitemap_generator/handler.py +++ b/sitemap_generator/handler/base.py @@ -27,10 +27,6 @@ # # ================================================================= -import click - -from datetime import datetime as dt -from git import Repo import logging import os from pathlib import Path @@ -39,24 +35,16 @@ from sitemap_generator.util import (url_join, get_smi, add_smi_node, get_urlset, add_urlset_node, - write_tree, walk_path, - parse, OPTION_VERBOSITY) 
+ write_tree) LOGGER = logging.getLogger(__name__) -# Environment Vars for Git Repository to source last mod -SOURCE_REPO = os.environ.get('SOURCE_REPO', '/geoconnex.us') -SOURCE_REPO_PATH = os.environ.get('SOURCE_REPO_PATH', 'namespaces') -# Git Repository objects -REPO = Repo(SOURCE_REPO) -TREE = REPO.heads.master.commit.tree -NAMESPACE = TREE / SOURCE_REPO_PATH # Sitemap directory objects SITEMAP_DIR = Path(os.environ.get('SITEMAP_DIR', '/sitemap')) -class Handler: +class BaseHandler: """Sitemap Generator Handler""" def __init__(self, filepath: Path, uri_stem: str) -> None: @@ -77,16 +65,35 @@ def handle(self) -> None: :returns: `None` """ + raise NotImplementedError + + def parse(self, filename: Path, n: int = 50000) -> list: + """ + Parse source file into chunks of size n + + :returns: `list` + """ + raise NotImplementedError - LOGGER.debug('Making urlsets') - [self.make_urlset(file) - for file in walk_path(self.root_path, r'.*.csv')] + def get_filetime(self, filename: Path) -> str: + """ + Gets last modified time of file. - LOGGER.debug('Making sitemap index') - urlsets = walk_path(self.root_path, r'.*.xml') - self.make_sitemap(urlsets) + :param filename: `Path` of file - LOGGER.debug('Finished task') + :returns file_time: `str` of file lastmod as W3C Datetime + """ + raise NotImplementedError + + def get_rel_path(self, filename: Path) -> str: + """ + Gets relative path to file. 
+ + :param filename: `Path` of file + + :returns parent: `str` of parent path + """ + raise NotImplementedError def make_urlset(self, filename: Path) -> None: """ @@ -97,8 +104,8 @@ def make_urlset(self, filename: Path) -> None: :returns: `None` """ LOGGER.debug(f'Making urlset for {filename}') - file_time = self._get_filetime(filename) - urlsets = parse(filename) + file_time = self.get_filetime(filename) + urlsets = self.parse(filename) for i, chunk in enumerate(urlsets): # Build sitemaps for each csv file @@ -135,78 +142,17 @@ def make_sitemap(self, files: Iterator[Path]) -> None: continue # Move xml to /sitemaps - filepath = (SITEMAP_DIR / self._get_rel_path(f)) + filepath = (SITEMAP_DIR / self.get_rel_path(f)) filepath.mkdir(parents=True, exist_ok=True) file_path = filepath / f.name LOGGER.debug(f'Copying urlset to {filepath}') copy2(f, file_path) # create to link /sitemap/_sitemap.xml - file_time = self._get_filetime(file_path) + file_time = self.get_filetime(file_path) url_ = url_join(self.uri_stem, file_path) add_smi_node(root, url_, file_time) sitemap_out = SITEMAP_DIR / '_sitemap.xml' LOGGER.debug(f'Writing sitemapindex to {sitemap_out}') write_tree(tree, sitemap_out) - - def _get_filetime(self, filename: Path) -> str: - """ - Gets relative path to file. - - :param filename: `Path` of file - - :returns file_time: `str` of file lastmod as W3C Datetime - """ - try: - LOGGER.debug('Getting filetime from Git commit') - blob = (NAMESPACE / self._get_rel_path(filename)) - commits = REPO.iter_commits(paths=blob.path, max_count=1) - commit = next(commits) - file_time = commit.committed_datetime - - except KeyError as err: - LOGGER.warning(err) - _ = os.path.getmtime(filename) - file_time = dt.fromtimestamp(_) - - except OSError as err: - LOGGER.warning(err) - file_time = dt.now() - - return file_time.strftime('%Y-%m-%dT%H:%M:%SZ') - - def _get_rel_path(self, filename: Path) -> str: - """ - Gets relative path to file. 
- - :param filename: `Path` of file - - :returns parent: `str` of parent path - """ - full_path = str(filename.resolve()) - LOGGER.debug(f'Resolving relative path for {full_path}') - if self.root_path in full_path: - LOGGER.debug('File in namespaces context') - parent = filename.parent.relative_to(self.root_path) - else: - LOGGER.debug('File in sitemap context') - parent = filename.parent.relative_to(SITEMAP_DIR) - - LOGGER.debug(f'Parent dir of file is: {parent}') - return str(parent) - - -@click.command() -@click.pass_context -@OPTION_VERBOSITY -@click.argument('filepath', type=click.Path()) -@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/', - help='uri stem to be removed from short url for keyword') -def run(ctx, verbosity, filepath, uri_stem): - handler = Handler(filepath, uri_stem) - handler.handle() - - -if __name__ == '__main__': - run() diff --git a/sitemap_generator/handler/filesystem.py b/sitemap_generator/handler/filesystem.py new file mode 100644 index 0000000..3cf411a --- /dev/null +++ b/sitemap_generator/handler/filesystem.py @@ -0,0 +1,141 @@ +# ================================================================= +# +# Authors: Benjamin Webb +# +# Copyright (c) 2023 Benjamin Webb +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +# ================================================================= + +from datetime import datetime as dt +from git import Repo +import logging +import os +from pathlib import Path + +from sitemap_generator.handler.base import BaseHandler, SITEMAP_DIR +from sitemap_generator.util import walk_path, parse + +LOGGER = logging.getLogger(__name__) + +# Environment Vars for Git Repository to source last mod +SOURCE_REPO = os.environ.get('SOURCE_REPO', '/geoconnex.us') +SOURCE_REPO_PATH = os.environ.get('SOURCE_REPO_PATH', 'namespaces') + + +class FileSystemHandler(BaseHandler): + def __init__(self, filepath: Path, uri_stem: str) -> None: + """ + Sitemap handler initializer + + :param filepath: `Path` of filepath to handle + :param uri_stem: `str` of sitemap location + + :returns: `None` + """ + super().__init__(filepath, uri_stem) + # Git Repository objects + self.repo = Repo(SOURCE_REPO) + self.tree = self.repo.heads.master.commit.tree + self.namespace = self.tree / SOURCE_REPO_PATH + + def handle(self) -> None: + """ + Handle sitemap creation sitemapindex + + :returns: `None` + """ + LOGGER.debug('Making urlsets') + for file in walk_path(self.root_path, r'.*.csv'): + self.make_urlset(file) + + LOGGER.debug('Making sitemap index') + urlsets = walk_path(self.root_path, r'.*.xml') + self.make_sitemap(urlsets) + + LOGGER.debug('Finished task') + + def parse(self, filename: Path, n: int = 50000) -> list: + """ + Parses CSV file into chunks + + :param filename: `Path` of source file to parse + 
:param n: `int` size of each chunk + + :returns: `list` + """ + return parse(filename, n) + + def get_filetime(self, filename: Path) -> str: + """ + Gets last modified time of file. + + :param filename: `Path` of file + + :returns file_time: `str` of file lastmod as W3C Datetime + """ + try: + LOGGER.debug(f'Getting filetime from Git commit for {filename.name}') + relative_path = self.get_rel_path(filename) + blob = (self.namespace / relative_path / filename.name) + commits = self.repo.iter_commits(paths=blob.path, max_count=1) + commit = next(commits) + file_time = dt.utcfromtimestamp(commit.committed_date) + + except (KeyError, StopIteration): + try: + blob = (self.namespace / relative_path) + commits = self.repo.iter_commits(paths=blob.path, max_count=1) + commit = next(commits) + file_time = dt.utcfromtimestamp(commit.committed_date) + + except (KeyError, StopIteration) as err: + LOGGER.warning(err) + _ = os.path.getmtime(filename) + file_time = dt.utcfromtimestamp(_) + + except OSError as err: + LOGGER.warning(err) + file_time = dt.utcnow() + + return file_time.strftime('%Y-%m-%dT%H:%M:%SZ') + + def get_rel_path(self, filename: Path) -> str: + """ + Gets relative path to file. 
+ + :param filename: `Path` of file + + :returns parent: `str` of parent path + """ + full_path = str(filename.resolve()) + LOGGER.debug(f'Resolving relative path for {full_path}') + if self.root_path in full_path: + LOGGER.debug('File in namespaces context') + parent = filename.parent.relative_to(self.root_path) + else: + LOGGER.debug('File in sitemap context') + parent = filename.parent.relative_to(SITEMAP_DIR) + + LOGGER.debug(f'Parent dir of file is: {parent}') + return str(parent) diff --git a/tests/test_handler.py b/tests/test_handler.py index 39de445..fbe436b 100644 --- a/tests/test_handler.py +++ b/tests/test_handler.py @@ -31,14 +31,15 @@ from pathlib import Path import xml.etree.ElementTree as ET -from sitemap_generator.handler import Handler, SITEMAP_DIR +from sitemap_generator.handler.base import SITEMAP_DIR +from sitemap_generator.handler.filesystem import FileSystemHandler from sitemap_generator.util import walk_path, url_join THIS_DIR = Path(__file__).parent.resolve() NAMESPACE = THIS_DIR / 'data' / 'namespaces' URI_STEM = 'https://geoconnex.us' -HANDLER = Handler(str(NAMESPACE), URI_STEM) +HANDLER = FileSystemHandler(str(NAMESPACE), URI_STEM) def test_handler(): @@ -51,9 +52,9 @@ def test_handler(): def test_sitemapindex(): [sitemapindex] = list(walk_path(SITEMAP_DIR, r'.*_sitemap.xml')) assert sitemapindex.name == '_sitemap.xml' - assert HANDLER._get_rel_path(sitemapindex) == '.' + assert HANDLER.get_rel_path(sitemapindex) == '.' 
- _ = HANDLER._get_filetime(sitemapindex) + _ = HANDLER.get_filetime(sitemapindex) file_time = datetime.strptime(_, '%Y-%m-%dT%H:%M:%SZ') today = datetime.utcnow().strftime('%Y-%m-%d') assert file_time.strftime('%Y-%m-%d') == today @@ -73,18 +74,18 @@ def test_sitemapindex(): def test_urlset(): [urlset] = list(walk_path(SITEMAP_DIR, r'.*links__0.xml')) assert urlset.name == 'links__0.xml' - assert HANDLER._get_rel_path(urlset) == 'iow' + assert HANDLER.get_rel_path(urlset) == 'iow' - _ = HANDLER._get_filetime(urlset) + _ = HANDLER.get_filetime(urlset) file_time = datetime.strptime(_, '%Y-%m-%dT%H:%M:%SZ') today = datetime.utcnow().strftime('%Y-%m-%d') assert file_time.strftime('%Y-%m-%d') != today - namespace = url_join(URI_STEM, HANDLER._get_rel_path(urlset)) + namespace = url_join(URI_STEM, HANDLER.get_rel_path(urlset)) assert namespace == 'https://geoconnex.us/iow' [urlset] = list(walk_path(SITEMAP_DIR, r'.*autotest1__0.xml')) - file_time = HANDLER._get_filetime(urlset) + file_time = HANDLER.get_filetime(urlset) tree = ET.parse(urlset) root = tree.getroot() diff --git a/tests/test_util.py b/tests/test_util.py index 4b1ee7c..08ab7c1 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -37,7 +37,7 @@ def test_walk_path(): glob = util.walk_path(NAMESPACE, r'.*') - assert len(list(glob)) == 3 + assert len(list(glob)) >= 3 glob = util.walk_path(NAMESPACE, r'.*csv') assert len(list(glob)) == 2