Skip to content

Commit

Permalink
Merge pull request #81 from Synse/end-punc
Browse files Browse the repository at this point in the history
Add backtick to `END_PUNCTUATION` regex
  • Loading branch information
pedramamini authored Jul 30, 2024
2 parents 5045ef2 + 2b868a5 commit b737020
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 1 deletion.
2 changes: 1 addition & 1 deletion iocextract.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
from urllib import unquote

# Reusable end punctuation regex
# Definition as it stood before this commit: the character class has no backtick.
END_PUNCTUATION = r"[\.\?>\"'\)!,}:;\u201d\u2019\uff1e\uff1c\]]*"
# Definition after this commit: identical except that a backtick (`) is added
# right after the single quote. The class is followed by `*`, so the pattern
# matches zero or more consecutive characters from the set.
# NOTE(review): presumably appended to the IOC patterns so trailing punctuation
# is left out of the extracted value -- confirm against the patterns that use it.
END_PUNCTUATION = r"[\.\?>\"'`\)!,}:;\u201d\u2019\uff1e\uff1c\]]*"

# Reusable regex for symbols commonly used to defang
# Matches exactly one character: a parenthesis, square bracket, curly brace,
# angle bracket, or backslash.
SEPARATOR_DEFANGS = r"[\(\)\[\]{}<>\\]"
Expand Down
19 changes: 19 additions & 0 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,25 @@ def test_url_extraction_handles_punctuation(self):
self.assertEqual(list(iocextract.extract_urls('example[.]com/)'))[0], 'example[.]com/')
self.assertEqual(list(iocextract.extract_urls('example[.]com/\''))[0], 'example[.]com/')

def test_url_extraction_end_punctuation(self):
    """
    Check how url extraction treats the characters listed in `END_PUNCTUATION`.

    Appended to the end of a url, each character must be left out of the
    first extracted url. Placed in the middle of the path, each non-unicode
    character must be kept in the first extracted url.
    """
    unicode_marks = ['\u201d', '\u2019', '\uff1e', '\uff1c']
    all_marks = ['.', '?', '>', '"', '\'', '`', ')', '!', ',', '}', ':', ';', '\u201d', '\u2019', '\uff1e', '\uff1c', ']']
    bare_url = 'https://example.com'

    # Trailing position: the punctuation must be dropped from the result.
    for mark in all_marks:
        found = list(iocextract.extract_urls(bare_url + mark))
        self.assertEqual(found[0], bare_url)

    # Mid-url position: the punctuation is part of the url and must survive.
    # The unicode characters are deliberately left out of this check.
    ascii_marks = [mark for mark in all_marks if mark not in unicode_marks]
    for mark in ascii_marks:
        candidate = 'https://example.com/a' + mark + 'b'
        found = list(iocextract.extract_urls(candidate))
        self.assertEqual(found[0], candidate)

def test_hex_url_extraction(self):
if sys.version_info[0] == 3:
hexconvert = lambda x: str(binascii.hexlify(bytes(x, 'ascii')), 'ascii')
Expand Down

0 comments on commit b737020

Please sign in to comment.