diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
index 8f7dc4d..649569a 100644
--- a/src/matchcode_toolkit/fingerprinting.py
+++ b/src/matchcode_toolkit/fingerprinting.py
@@ -8,11 +8,11 @@
 #
 
 import binascii
-import codecs
 import re
 
 from commoncode import filetype
 from licensedcode.tokenize import ngrams
+from typecode.contenttype import get_type
 
 from matchcode_toolkit.halohash import BitAverageHaloHash
 
@@ -212,11 +212,11 @@ def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
     Return an empty mapping if `location` is not a text file
     """
     # Do not process `location` if it's not a text file
-    if not filetype.is_file(location):
+    ft = get_type(location)
+    if not (filetype.is_file(location) and ft.is_text):
         return {}
 
-    # TODO: Check for robust text-reading code in license and copyright detection
-    with codecs.open(location, encoding='utf-8') as f:
+    with open(location, encoding='utf-8') as f:
         content = f.read()
 
     # Break content into words, then create ngrams from words