Skip to content

Commit

Permalink
Re-add text file check in get_file_fingerprint_hashes #5
Browse files (browse the repository at this point in the history)
    * Use open instead of codecs.open

Signed-off-by: Jono Yang <[email protected]>
  • Loading branch information
JonoYang committed Apr 15, 2024
1 parent 287f8ec commit a9316c2
Showing 1 changed file with 4 additions and 4 deletions.
8 changes: 4 additions & 4 deletions src/matchcode_toolkit/fingerprinting.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,11 +8,11 @@
#

import binascii
- import codecs
import re

from commoncode import filetype
from licensedcode.tokenize import ngrams
+ from typecode.contenttype import get_type

from matchcode_toolkit.halohash import BitAverageHaloHash

Expand Down Expand Up @@ -212,11 +212,11 @@ def get_file_fingerprint_hashes(location, ngram_length=8, **kwargs):
Return an empty mapping if `location` is not a text file
"""
# Do not process `location` if it's not a text file
- if not filetype.is_file(location):
+ ft = get_type(location)
+ if not (filetype.is_file(location) and ft.is_text):
return {}

# TODO: Check for robust text-reading code in license and copyright detection
- with codecs.open(location, encoding='utf-8') as f:
+ with open(location, encoding='utf-8') as f:
content = f.read()

# Break content into words, then create ngrams from words
Expand Down

0 comments on commit a9316c2

Please sign in to comment.