Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make FileSystemHandler #4

Merged
merged 1 commit into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,11 @@ jobs:

steps:
- uses: actions/checkout@v3
- name: Checkout Geoconnex Namespace
- name: Checkout Cached Geoconnex Namespace
run: |
git clone -b master https://github.com/internetofwater/geoconnex.us.git geoconnex.us
cd geoconnex.us
git checkout e0e1c2ba0d023bfd80f3e6d76c85c01fab35c581
- uses: actions/setup-python@v2
name: Setup Python ${{ matrix.python-version }}
with:
Expand Down
51 changes: 51 additions & 0 deletions sitemap_generator/handler/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# =================================================================
#
# Authors: Benjamin Webb <[email protected]>
#
# Copyright (c) 2023 Benjamin Webb
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

'''Handler classs'''

import click

from sitemap_generator.handler.filesystem import FileSystemHandler
from sitemap_generator.util import OPTION_VERBOSITY


@click.command()
@click.pass_context
@OPTION_VERBOSITY
@click.argument('filepath', type=click.Path())
@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/',
help='uri stem to be removed from short url for keyword')
def run(ctx, verbosity, filepath, uri_stem):
if filepath.is_dir():
handler = FileSystemHandler(filepath, uri_stem)
handler.handle()


if __name__ == '__main__':
run()
124 changes: 38 additions & 86 deletions sitemap_generator/handler.py → sitemap_generator/handler/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@
#
# =================================================================

import click

from datetime import datetime as dt
from git import Repo
from datetime import datetime
import logging
import os
from pathlib import Path
Expand All @@ -39,24 +36,16 @@

from sitemap_generator.util import (url_join, get_smi, add_smi_node,
get_urlset, add_urlset_node,
write_tree, walk_path,
parse, OPTION_VERBOSITY)
write_tree)

LOGGER = logging.getLogger(__name__)

# Environment Vars for Git Repository to source last mod
SOURCE_REPO = os.environ.get('SOURCE_REPO', '/geoconnex.us')
SOURCE_REPO_PATH = os.environ.get('SOURCE_REPO_PATH', 'namespaces')
# Git Repository objects
REPO = Repo(SOURCE_REPO)
TREE = REPO.heads.master.commit.tree
NAMESPACE = TREE / SOURCE_REPO_PATH

# Sitemap directory objects
SITEMAP_DIR = Path(os.environ.get('SITEMAP_DIR', '/sitemap'))


class Handler:
class BaseHandler:
"""Sitemap Generator Handler"""

def __init__(self, filepath: Path, uri_stem: str) -> None:
Expand All @@ -77,16 +66,35 @@ def handle(self) -> None:
:returns: `None`
"""
raise NotImplementedError

def parse(self) -> None:
"""
Parse sitemap creation sitemapindex
:returns: `None`
"""
raise NotImplementedError

def get_filetime(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns file_time: `str` of file lastmod as W3C Datetime
"""
raise NotImplementedError

LOGGER.debug('Making urlsets')
[self.make_urlset(file)
for file in walk_path(self.root_path, r'.*.csv')]
def get_rel_path(self, filename: Path) -> str:
"""
Gets relative path to file.
LOGGER.debug('Making sitemap index')
urlsets = walk_path(self.root_path, r'.*.xml')
self.make_sitemap(urlsets)
:param filename: `Path` of file
LOGGER.debug('Finished task')
:returns parent: `str` of parent path
"""
raise NotImplementedError

def make_urlset(self, filename: Path) -> None:
"""
Expand All @@ -97,8 +105,8 @@ def make_urlset(self, filename: Path) -> None:
:returns: `None`
"""
LOGGER.debug(f'Making urlset for {filename}')
file_time = self._get_filetime(filename)
urlsets = parse(filename)
file_time = self.get_filetime(filename)
urlsets = self.parse(filename)

for i, chunk in enumerate(urlsets):
# Build sitemaps for each csv file
Expand All @@ -115,6 +123,11 @@ def make_urlset(self, filename: Path) -> None:
sitemap_file = (filename.parent / fidx).with_suffix('.xml')
write_tree(tree, sitemap_file)

_ = datetime.strptime(file_time, '%Y-%m-%dT%H:%M:%SZ')
mtime = _.timestamp()
atime = datetime.now().timestamp()
os.utime(sitemap_file, (atime, mtime))

def make_sitemap(self, files: Iterator[Path]) -> None:
"""
Create sitemapindex
Expand All @@ -135,78 +148,17 @@ def make_sitemap(self, files: Iterator[Path]) -> None:
continue

# Move xml to /sitemaps
filepath = (SITEMAP_DIR / self._get_rel_path(f))
filepath = (SITEMAP_DIR / self.get_rel_path(f))
filepath.mkdir(parents=True, exist_ok=True)
file_path = filepath / f.name
LOGGER.debug(f'Copying urlset to {filepath}')
copy2(f, file_path)

# create to link /sitemap/_sitemap.xml
file_time = self._get_filetime(file_path)
file_time = self.get_filetime(file_path)
url_ = url_join(self.uri_stem, file_path)
add_smi_node(root, url_, file_time)

sitemap_out = SITEMAP_DIR / '_sitemap.xml'
LOGGER.debug(f'Writing sitemapindex to {sitemap_out}')
write_tree(tree, sitemap_out)

def _get_filetime(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns file_time: `str` of file lastmod as W3C Datetime
"""
try:
LOGGER.debug('Getting filetime from Git commit')
blob = (NAMESPACE / self._get_rel_path(filename))
commits = REPO.iter_commits(paths=blob.path, max_count=1)
commit = next(commits)
file_time = commit.committed_datetime

except KeyError as err:
LOGGER.warning(err)
_ = os.path.getmtime(filename)
file_time = dt.fromtimestamp(_)

except OSError as err:
LOGGER.warning(err)
file_time = dt.now()

return file_time.strftime('%Y-%m-%dT%H:%M:%SZ')

def _get_rel_path(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns parent: `str` of parent path
"""
full_path = str(filename.resolve())
LOGGER.debug(f'Resolving relative path for {full_path}')
if self.root_path in full_path:
LOGGER.debug('File in namespaces context')
parent = filename.parent.relative_to(self.root_path)
else:
LOGGER.debug('File in sitemap context')
parent = filename.parent.relative_to(SITEMAP_DIR)

LOGGER.debug(f'Parent dir of file is: {parent}')
return str(parent)


@click.command()
@click.pass_context
@OPTION_VERBOSITY
@click.argument('filepath', type=click.Path())
@click.option('-s', '--uri_stem', type=str, default='https://geoconnex.us/',
help='uri stem to be removed from short url for keyword')
def run(ctx, verbosity, filepath, uri_stem):
handler = Handler(filepath, uri_stem)
handler.handle()


if __name__ == '__main__':
run()
134 changes: 134 additions & 0 deletions sitemap_generator/handler/filesystem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# =================================================================
#
# Authors: Benjamin Webb <[email protected]>
#
# Copyright (c) 2023 Benjamin Webb
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
# =================================================================

from datetime import datetime as dt
from git import Repo
import logging
import os
from pathlib import Path

from sitemap_generator.handler.base import BaseHandler, SITEMAP_DIR
from sitemap_generator.util import walk_path, parse

LOGGER = logging.getLogger(__name__)

# Environment Vars for Git Repository to source last mod
SOURCE_REPO = os.environ.get('SOURCE_REPO', '/geoconnex.us')
SOURCE_REPO_PATH = os.environ.get('SOURCE_REPO_PATH', 'namespaces')


class FileSystemHandler(BaseHandler):
def __init__(self, filepath: Path, uri_stem: str) -> None:
"""
Sitemap handler initializer
:param filepath: `Path` of filepath to handle
:param uri_stem: `str` of sitemap location
:returns: `None`
"""
super().__init__(filepath, uri_stem)
# Git Repository objects
self.repo = Repo(SOURCE_REPO)
self.tree = self.repo.heads.master.commit.tree
self.namespace = self.tree / SOURCE_REPO_PATH

def handle(self) -> None:
"""
Handle sitemap creation sitemapindex
:returns: `None`
"""
LOGGER.debug('Making urlsets')
[self.make_urlset(file)
for file in walk_path(self.root_path, r'.*.csv')]

LOGGER.debug('Making sitemap index')
urlsets = walk_path(self.root_path, r'.*.xml')
self.make_sitemap(urlsets)

LOGGER.debug('Finished task')

def parse(self, filename: Path, n: int = 50000) -> list:
"""
Parses file to a CSV
:param filename: `Path` of source file to parse
:param n: `int` size of each chunk
:returns: `list`
"""
return parse(filename, n)

def get_filetime(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns file_time: `str` of file lastmod as W3C Datetime
"""
try:
LOGGER.debug(f'Getting filetime from Git commit for {filename}')
relative_path = self.get_rel_path(filename)
blob = (self.namespace / relative_path / filename.name)
commits = self.repo.iter_commits(paths=blob.path, max_count=1)
commit = next(commits)
file_time = commit.committed_datetime

except KeyError as err:
LOGGER.warning(err)
_ = os.path.getmtime(filename)
file_time = dt.fromtimestamp(_)

except OSError as err:
LOGGER.warning(err)
file_time = dt.now()

return file_time.strftime('%Y-%m-%dT%H:%M:%SZ')

def get_rel_path(self, filename: Path) -> str:
"""
Gets relative path to file.
:param filename: `Path` of file
:returns parent: `str` of parent path
"""
full_path = str(filename.resolve())
LOGGER.debug(f'Resolving relative path for {full_path}')
if self.root_path in full_path:
LOGGER.debug('File in namespaces context')
parent = filename.parent.relative_to(self.root_path)
else:
LOGGER.debug('File in sitemap context')
parent = filename.parent.relative_to(SITEMAP_DIR)

LOGGER.debug(f'Parent dir of file is: {parent}')
return str(parent)
Loading
Loading