-
Notifications
You must be signed in to change notification settings - Fork 481
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
24 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,27 +1,34 @@ | ||
import re | ||
|
||
from .base import RegexBasedDetector | ||
|
||
class EmailAddressDetector(RegexBasedDetector): | ||
"""Email Address Detector. | ||
|
||
This class is designed to efficiently and accurately detect email addresses within given text. It primarily | ||
validates the general format of email addresses, and does not adhere strictly to email format standards such as RFC 5322. | ||
class EmailAddressDetector(RegexBasedDetector): | ||
""" | ||
A detector for identifying email addresses within text. It uses regular expressions to | ||
focus on general email structures, not strictly adhering to standards like RFC 5322. | ||
Designed for efficient and broad detection, it also has some limitations. | ||
Key Features: | ||
- Ignores common, non-security-threatening email addresses to enhance precision. | ||
Features: | ||
- Detects a wide range of email formats efficiently. | ||
- Ignores common, non-critical emails to minimize false positives. | ||
Limitations: | ||
- Despite robust detection mechanisms, the class is not infallible and may not cover all edge cases. | ||
- It does not support some examples from RFC 6530, e.g., email addresses with Greek alphabets. | ||
- May miss edge cases or unconventional email formats. | ||
- Not compliant with advanced formats, e.g., RFC 6530 non-Latin emails. | ||
References: | ||
Regular Expression: | ||
Utilizes a regex pattern focusing on typical email components: local part, domain, TLD. | ||
Excludes predefined whitelist emails to reduce false positives. | ||
References: | ||
- https://en.wikipedia.org/wiki/Email_address | ||
- https://stackoverflow.com/a/14321045 | ||
""" | ||
secret_type = 'Email Address' | ||
|
||
whitelist = ['[email protected]', '[email protected]'] | ||
# Excludes whitelist email addresses from detection to reduce false positives. | ||
whitelist = ['[email protected]', '[email protected]'] | ||
|
||
base_pattern = r""" | ||
[\w+-]+ # Local part before the @ symbol | ||
|
@@ -32,21 +39,23 @@ class EmailAddressDetector(RegexBasedDetector): | |
(?:\.[a-zA-Z]{2,4}) # TLD part | ||
""" | ||
# Pattern Breakdown: | ||
# 1. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - | ||
# 1. [\w+-]+: Matches one or more word characters (letters, digits, _), +, or - | ||
# Represents the local part of the email address before the @ symbol. | ||
# 2. (?:\.[\w+-]+)*: Matches zero or more groups, each a . (dot) followed by one or more word characters (letters, digits, _), +, or - | ||
# Allows for dot-separated words in the local part of the email address. | ||
# 3. @: Matches the @ symbol. | ||
# 4. [\w+-]+: Matches one or more of a-z, A-Z, _, +, - | ||
# 4. [\w+-]+: Matches one or more word characters (letters, digits, _), +, or - | ||
# Represents the domain part of the email address after the @ symbol. | ||
# 5. (?:\.[\w+-]+)*: Matches zero or more groups, each a . (dot) followed by one or more word characters (letters, digits, _), +, or - | ||
# Allows for dot-separated words in the domain part of the email address. | ||
# 6. (?:\.[a-zA-Z]{2,4}): Matches a . (dot) followed by 2 to 4 letters (a-z, A-Z) | ||
# Represents the TLD (top-level domain) part of the email address. | ||
|
||
deny_pattern = r"(?!" + "|".join(re.escape(email) for email in whitelist) + r"$)" + base_pattern | ||
deny_pattern = r'(?!' \ | ||
+ '|'.join(re.escape(email) for email in whitelist) \ | ||
+ r'$)' + base_pattern | ||
# Combines the base pattern with a negative lookahead to exclude whitelist email addresses. | ||
|
||
denylist = [ | ||
re.compile(r"\b" + deny_pattern + r"\b", flags=re.VERBOSE) | ||
re.compile(r'\b' + deny_pattern + r'\b', flags=re.VERBOSE), | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,5 @@ | ||
import pytest | ||
|
||
from detect_secrets.plugins.email_address import EmailAddressDetector | ||
|
||
|
||
|