From 0d9187229ff755fb672e7f0de64c7926c675af7e Mon Sep 17 00:00:00 2001
From: Thomas Juul Dyhr <thomas@dyhr.com>
Date: Fri, 5 Apr 2024 10:50:48 +0200
Subject: [PATCH] tld_check refactoring

---
 TODO.md      | 32 ++++++++++++++++++++++++++++++++
 httpcheck.py | 23 +++++++----------------
 2 files changed, 39 insertions(+), 16 deletions(-)

diff --git a/TODO.md b/TODO.md
index c97cb76..f9ab963 100644
--- a/TODO.md
+++ b/TODO.md
@@ -10,3 +10,35 @@
 * figure out the correct use of pipe with | ie. 'httpcheck -' for piping into httpcheck, see FileType and Nargs within [argparse](https://docs.python.org/3.8/library/argparse.html#nargs)
 * notification of user integration with email, message, popup, notification, phone, see terminal-notifier or just use osascript "display notification" for osx
 * implement threading propperly to show info
+
+## tld check refactoring
+
+`tld_check` should be modified to use the function `validators.domain()` from the Python package `validators`, which has built-in support for domain name validation; recent releases can additionally check the TLD against the IANA list of top-level domains when called with `consider_tld=True`.
+
+```python
+import validators
+from urllib.parse import urlparse
+
+def tld_check(url):
+    """Return True for a URL with a valid domain; raise ValueError or InvalidTLDException otherwise."""
+    parsed = urlparse(url)
+    if not all([parsed.scheme, parsed.netloc]):  # both a scheme and a netloc are required
+        raise ValueError('Invalid URL')
+    # validators.domain() reports failure by returning a falsy error object rather than
+    # raising, so test its result instead of wrapping the call in try/except.
+    if not validators.domain(parsed.hostname):
+        raise InvalidTLDException(f"[-] Domain not in global list of TLDs: '{url}'")
+    return True
+```
+Use the function as follows:
+
+```python
+tld_check('https://www.example.com')  # returns True if URL is valid, otherwise raises an exception
+```
+This checks that the URL contains a syntactically valid domain; to also require that its TLD exists in the IANA list of top-level domains, call `validators.domain(domain, consider_tld=True)`. If a check fails, `tld_check` raises an exception.
+
+Make sure to install the `validators` Python package with:
+```sh
+pip install validators
+```
+Consider replacing the hand-written implementation of `tld_check` with this library unless there is a specific reason to keep it: the library-based version is likely to be more efficient and accurate, and it can validate the TLD against a maintained list of top-level domains, which is not possible with a simple regex or similar methods.
\ No newline at end of file
diff --git a/httpcheck.py b/httpcheck.py
index d4d45b2..83b1e57 100755
--- a/httpcheck.py
+++ b/httpcheck.py
@@ -19,6 +19,7 @@
 import textwrap
 import concurrent.futures
 import requests
+import validators
 
 
 VERSION = "1.2.0"
@@ -208,23 +209,13 @@ def load_tlds(file_path):
                 tlds.append(line)
     return tlds
 
-def tld_check(url, tld_file_path):
-    """Check url for valid TLD against tld file."""
-    tlds = load_tlds(tld_file_path)
 
-    url_elements = urlparse(url).netloc.split('.')
-    for i in range(-len(url_elements), 0):
-        last_i_elements = url_elements[i:]
-        candidate = ".".join(last_i_elements)
-        wildcard_candidate = ".".join(["*"] + last_i_elements[1:])
-        exception_candidate = f"!{candidate}"
-
-        if exception_candidate in tlds:
-            return ".".join(url_elements[i:])
-        if candidate in tlds or wildcard_candidate in tlds:
-            return ".".join(url_elements[i - 1:])
-
-    raise InvalidTLDException(f"[-] Domain not in global list of TLDs: '{url}'")
+def tld_check(url):
+    """Return url unchanged when validators.url() accepts it; raise InvalidTLDException otherwise."""
+    # Any result other than the literal True is treated as a validation failure.
+    if validators.url(url) is True:
+        return url
+    raise InvalidTLDException(f"[-] Domain not in global list of TLDs: '{url}'")
 
 def check_site(site):
     """Check webiste status code."""