Skip to content

Commit

Permalink
Initial functional commit adds documentation, CLI and library.
Browse files Browse the repository at this point in the history
  • Loading branch information
Tom Thorogood committed Mar 29, 2022
1 parent 01fab0c commit 59ff6f0
Show file tree
Hide file tree
Showing 9 changed files with 683 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
__pycache__
dist
*.pyc
81 changes: 81 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Fingerprinter

This utility library can be used to create SHA256 fingerprints
using globs and paths. It is meant to be a replacement
for the `sources/fingerprints.sh` in
[common-build-scripts].

## Installation

- using **pip**: `pip install git+https://github.com/UWIT-IAM/fingerprinter.git@v0.1`
- using **poetry**: `fingerprinter = { git = "https://github.com/UWIT-IAM/fingerprinter.git", tag="v0.1" }`


## Use

```
python -m fingerprinter.cli --help
```


### Configuration File

To get started, you'll need a configuration file. This file is a yaml file
that defines your fingerprint targets.

```yaml
targets:
  target-name:
    include-paths:
      - src/**.py # Glob to match all python files recursively under a directory
      - src/ # Will match every file under src/, recursively. (Same as 'src/**')
      - src # interchangeable with `src/` or `src/**`
      - src/foo.py # Include a specific file
```
You may also declare other targets as dependencies:
```yaml
# This example has a source fingerprint that is generated for all python files
# under the src/ directory, but the fingerprint is dependent on the
# dependency locks. This means that even if all python files remain
# untouched, an update to the dependencies will generate a new
# source fingerprint.
# `fingerprints.yaml` is also included here to ensure that changes
# to the actual fingerprint configuration regenerates all fingerprints.
targets:
  dependencies:
    include-paths:
      - poetry.lock
      - fingerprints.yaml
  source:
    depends-on: [dependencies]
    include-paths: ['src/**.py']
```
**All paths will be lexicographically sorted at runtime**; however, dependencies
are always resolved in the order provided.
`python -m fingerprinter.cli -f fingerprints.yaml -t source` will do the rest!

### Excluding Files

There may be some paths that you never want to consider. For instance `__pycache__` is
always excluded by default, no matter where it falls.

You can exclude paths at the base of your yaml:

```yaml
ignore-paths:
  - __pycache__ # Never necessary, this path is always ignored
  - .secrets # Entire directory will always be ignored wherever it is in the tree
  - secret.py # Will be ignored in every directory it exists in.
targets:
  foo:
    # Will include src/foo/bar, but not src/.secrets/sekret or src/foo/__pycache__/blah
    include-paths: ['src']
```


[common-build-scripts]: https://github.com/uwit-iam/common-build-scripts
2 changes: 2 additions & 0 deletions fingerprinter/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .fingerprinter import Fingerprinter
from .models import FingerprintConfig, FingerprintTarget
48 changes: 48 additions & 0 deletions fingerprinter/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import argparse
import logging
import yaml

from . import Fingerprinter
from .models import FingerprintConfig


def get_parser() -> argparse.ArgumentParser:
    """
    Build the command-line parser for the fingerprinter CLI.

    :return: A parser that requires ``--config-file``/``-f`` and
        ``--target``/``-t`` and accepts the optional ``--verbose``/``-v``
        and ``--debug``/``-g`` flags.
    """
    parser = argparse.ArgumentParser(
        # Passed by keyword: the first positional parameter of ArgumentParser
        # is ``prog`` (the program name shown in usage), not the help text.
        description=(
            "Given a target and an input yaml, returns the target's SHA-256 hash.\n"
            "--config-file and --target are always required.\n"
        ),
    )
    parser.add_argument('--config-file', '-f', required=True,
                        help='The config file you want to use to generate fingerprints.')
    parser.add_argument('--target', '-t', required=True,
                        help='The target from the config file whose fingerprint you want to get')
    parser.add_argument('--verbose', '-v', action='store_true', default=False,
                        help='Set log level to INFO')
    parser.add_argument('--debug', '-g', action='store_true', default=False,
                        help='Set log level to DEBUG')
    return parser


def load_yaml(filename: str) -> FingerprintConfig:
    """
    Read a YAML file and parse its contents into a FingerprintConfig.

    :param filename: Path of the YAML configuration file to read.
    :return: The configuration model built from the file's contents.
    """
    with open(filename) as config_file:
        # SafeLoader only builds plain Python objects from the document.
        raw_config = yaml.load(config_file, Loader=yaml.SafeLoader)
    return FingerprintConfig.parse_obj(raw_config)


def main():
    """
    Command-line entry point.

    Parses the arguments, loads the configuration file, configures logging
    from the ``--verbose``/``--debug`` flags, and prints the fingerprint of
    the requested target to stdout.
    """
    args = get_parser().parse_args()
    config = load_yaml(args.config_file)

    # --debug takes precedence over --verbose when both are supplied.
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING

    logging.basicConfig(level=log_level)
    logging.debug("Starting in DEBUG mode")

    fingerprinter = Fingerprinter(config)
    fingerprint = fingerprinter.get_fingerprint(args.target)
    print(fingerprint)


# Run the CLI only when this module is executed as a script.
if __name__ == "__main__":
    main()
102 changes: 102 additions & 0 deletions fingerprinter/fingerprinter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import glob
import hashlib

__all__ = [
'Fingerprinter'
]

import logging

import os
from typing import List

from .models import FingerprintConfig

class Fingerprinter:
    """
    Computes SHA-256 fingerprints for the targets of a fingerprint config.

    A target's fingerprint is the SHA-256 of, in order, the fingerprints of
    the targets it depends on followed by the fingerprints of its
    (lexicographically sorted) include paths. Only file contents feed the
    hash; paths matching the ignore list are skipped.
    """

    def __init__(self, config: "FingerprintConfig"):
        """
        :param config: The configuration providing ``targets`` and
            ``ignore_paths``.
        """
        self.config = config
        # Maps an include path or pattern to the sorted list it resolved to.
        self.path_cache = {}
        # Ignore patterns. Paths found to be ignored are added here as well,
        # so repeated checks of the same path short-circuit.
        self.ignored_paths = {'__pycache__'}
        self.ignored_paths.update(self.config.ignore_paths)
        # Paths that were already checked and found not to be ignored.
        self.included_paths = set()

    def resolve_path(self, path: str) -> List[str]:
        """
        Expand an include path into the sorted list of paths it refers to.

        An existing file resolves to itself; an existing directory resolves
        to its direct children (as matched by ``*``, which does not match
        names starting with a dot); anything else is treated as a glob
        pattern. Results are cached per input path.

        :param path: A file path, directory path, or glob pattern.
        :return: The sorted matching paths; empty when nothing matches.
        """
        if path not in self.path_cache:
            if os.path.isfile(path):
                resolved = [path]
            elif os.path.isdir(path):
                # Escape the directory name so glob metacharacters in it
                # are taken literally; only the trailing '*' is a pattern.
                children = os.path.join(glob.escape(path), '*')
                resolved = sorted(glob.glob(children))
            else:
                # Not an existing file or directory: expand it as a glob.
                # A literal path that does not exist simply yields [].
                resolved = sorted(glob.glob(path, recursive=True))
            # Cache under the path the caller passed in, so the lookup
            # above actually hits on the next call.
            self.path_cache[path] = resolved
        return self.path_cache[path]

    @staticmethod
    def get_file_sha256sum(filename: str) -> bytes:
        """
        Hash a file block by block, to avoid reading it into memory
        all at once.
        Taken from: https://stackoverflow.com/a/44873382/677283

        :param filename: The name of the file you want to hash
        :return: The file's SHA-256 hex digest, encoded as UTF-8 bytes
        """
        h = hashlib.sha256()
        b = bytearray(128*1024)
        mv = memoryview(b)
        with open(filename, 'rb', buffering=0) as f:
            # readinto() returns the number of bytes read; 0 means EOF.
            for n in iter(lambda: f.readinto(mv), 0):
                h.update(mv[:n])
        return h.hexdigest().encode('UTF-8')

    def path_is_ignored(self, filename: str) -> bool:
        """
        Determines whether a path should be excluded from the fingerprint.

        Each path is only checked once; after that, its status as ignored
        or included is cached, to avoid having to re-parse matching globs
        over and over again.

        :param filename: The path to check.
        :return: True if the path matches the ignore list, False otherwise.
        """
        if filename in self.included_paths:
            return False
        if filename in self.ignored_paths:
            return True

        basename = os.path.basename(filename)
        dirname = os.path.dirname(filename)
        components = set(os.path.normpath(filename).split(os.sep))

        # The match is computed in full before the set is modified below;
        # adding to a set while iterating over it raises RuntimeError.
        ignored = any(
            # /foo/bar will be ignored if 'foo/*' is ignored
            ('*' in p and filename in glob.glob(p))
            # /foo/bar/baz.py will be ignored if 'baz.py' is ignored
            or basename == p
            # /foo/bar/baz.py will be ignored if '/foo/bar' is ignored
            or dirname == p
            # /foo/bar/baz.py will be ignored if 'bar' is ignored, so that
            # an ignored directory name applies anywhere in the tree
            or p in components
            for p in self.ignored_paths
        )

        if ignored:
            self.ignored_paths.add(filename)
            return True

        self.included_paths.add(filename)
        return False

    def get_path_fingerprint(self, path: str) -> bytes:
        """
        Fingerprint everything an include path resolves to.

        Directories are descended into recursively; ignored paths are
        skipped and contribute nothing to the hash.

        :param path: A file path, directory path, or glob pattern.
        :return: The SHA-256 hex digest, encoded as UTF-8 bytes.
        """
        h = hashlib.sha256()
        for fn in sorted(self.resolve_path(path)):
            if self.path_is_ignored(fn):
                logging.debug(f"Skipping ignored path: {fn}")
                continue
            if os.path.isdir(fn):
                h.update(self.get_path_fingerprint(fn))
            elif os.path.isfile(fn):
                logging.debug(f"Getting fingerprint for file: {fn}")
                h.update(self.get_file_sha256sum(fn))
        return h.hexdigest().encode('UTF-8')

    def get_fingerprint_bytes(self, target: str) -> bytes:
        """
        :param target: The name of a configured target.
        :return: The target's fingerprint, encoded as UTF-8 bytes.
        :raises KeyError: If the target is not in the configuration.
        """
        return self.get_fingerprint(target).encode('UTF-8')

    def get_fingerprint(self, target: str) -> str:
        """
        Compute the fingerprint of a configured target.

        :param target: The name of a configured target.
        :return: The target's SHA-256 fingerprint as a hex string.
        :raises KeyError: If the target (or one of its dependencies) is
            not in the configuration.
        :raises ValueError: If the target's dependencies form a cycle.
        """
        return self._get_fingerprint(target, ())

    def _get_fingerprint(self, target: str, chain: tuple) -> str:
        """Fingerprint ``target``; ``chain`` holds the targets being resolved above it."""
        logging.debug(f"Getting fingerprint for {target}")
        if target in chain:
            cycle = ' -> '.join(chain + (target,))
            raise ValueError(f"Circular target dependency: {cycle}")

        target_config = self.config.targets[target]  # Raises KeyError
        h = hashlib.sha256()

        # Dependencies are hashed in the order they were declared.
        for dep in target_config.depends_on:
            dep_fingerprint = self._get_fingerprint(dep, chain + (target,))
            h.update(dep_fingerprint.encode('UTF-8'))

        for path in sorted(target_config.include_paths):
            h.update(self.get_path_fingerprint(path))

        return h.hexdigest()
16 changes: 16 additions & 0 deletions fingerprinter/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
from typing import Dict, List

from pydantic import BaseModel, Field


class FingerprintTarget(BaseModel):
    """A single fingerprint target as declared in the configuration file."""

    # Names of other targets this target depends on; populated from the
    # hyphenated `depends-on` key. Defaults to an empty list.
    depends_on: List[str] = Field(default_factory=list, alias='depends-on')

    # Paths to include, populated from the `include-paths` key. Intended to
    # accept files, directories (treated recursively) and globs.
    # Defaults to an empty list.
    include_paths: List[str] = Field(default_factory=list, alias='include-paths')


class FingerprintConfig(BaseModel):
    """Top-level model of the fingerprint configuration file."""

    # Paths to exclude, populated from the `ignore-paths` key.
    # Defaults to ['__pycache__'] when the key is absent.
    ignore_paths: List[str] = Field(
        default_factory=lambda: ['__pycache__'],
        alias='ignore-paths',
    )

    # Target definitions keyed by target name (required).
    targets: Dict[str, FingerprintTarget]
17 changes: 17 additions & 0 deletions fingerprints.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
```yaml
# This example has a source fingerprint that is generated for all python files
# under the src/ directory, but the fingerprint is dependent on the
# dependency locks. This means that even if all python files remain
# untouched, an update to the dependencies will generate a new
# source fingerprint.
# `fingerprints.yaml` is also included here to ensure that changes
# to the actual fingerprint configuration regenerates all fingerprints.
targets:
dependencies:
include-paths:
- poetry.lock
- fingerprints.yaml
source:
depends-on: [dependencies]
include-paths: ['fingerprinter/**.py']
```
Loading

0 comments on commit 59ff6f0

Please sign in to comment.