Skip to content

Commit

Permalink
methods can return a list, no need for a decorator #32
Browse files Browse the repository at this point in the history
Web handles redirects even if they are not accessible

Signed-off-by: Edvard Rejthar <[email protected]>
  • Loading branch information
e3rd committed Oct 31, 2019
1 parent 2f1e693 commit d8629b2
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 119 deletions.
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,17 +160,16 @@ def any_method(value):

You may as well hard code custom fields in the [`config.ini`](convey/config.ini.default) by providing paths to the entrypoint Python files delimited by a comma: `custom_fields_modules = /tmp/myfile.py, /tmp/anotherfile.py`. All the public methods in the defined files will become custom fields!

If you need a single call to generate multiple rows, return list and decorate with @duplicate_row.
If you need a single call to generate multiple rows, return a list; any row receiving a list will be duplicated for each of its items.

```python3
from convey import duplicate_row

@duplicate_row
def any_method(value):
# do something
return ["foo", "bar"]
```

Ex: if one method returns 2 items and another returns 3 items, you will receive 6 similar rows (one per combination).

A handy feature if you wish to use the Shodan API (as one of our partners does) or to do anything similar.

## Examples
Expand Down
6 changes: 1 addition & 5 deletions convey/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,2 @@
# import Config file as the very first one (setup logging)
from .config import Config
from .identifier import MultipleRows
duplicate_row = MultipleRows.duplicate_row

__all__ = ["duplicate_row"]
from .config import Config
4 changes: 3 additions & 1 deletion convey/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,16 @@ def application(env, start_response):
if not WebServer.source_parser:
from convey.config import Config
from convey.parser import Parser
from convey.identifier import Types
Types.init()
Config.init()
WebServer.source_parser = Parser(prepare=False)

headers = [('Access-Control-Allow-Origin', '*')]
t = env["QUERY_STRING"].split("q=") # XX sanitize?
if len(t) == 2:
res = WebServer.source_parser.set_stdin([t[1]]).prepare()
if res.single_value:
if res.is_single_value:
response = res.run_single_value(json=True)
headers.append(('Content-Type', 'application/json'))
status = '200 OK'
Expand Down
134 changes: 52 additions & 82 deletions convey/identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import importlib.util
import ipaddress
import logging
import operator
import re
import socket
import subprocess
Expand All @@ -11,7 +10,6 @@
from copy import copy
from csv import Error, Sniffer, reader
from enum import IntEnum
from functools import wraps, reduce
from pathlib import Path
from typing import List

Expand Down Expand Up @@ -48,72 +46,17 @@ def check_ip(ip):
return False


def prod(iterable):  # XX as of Python3.8, replace with math.prod
    """ Return the product of all items in *iterable* (1 for an empty iterable). """
    result = 1
    for factor in iterable:
        result *= factor
    return result


class MultipleRows:
    """ Support for columns whose computing function returns a list instead of a scalar.

    Each instance wraps one such function; all instances active on the current
    row are registered in the class-level ``el_on_row`` list so that their
    result queues can be kept in sync while the Processor duplicates rows.
    """

    # All MultipleRows fields participating in the currently processed row,
    # in column order; the last registered one drives the duplication signal.
    el_on_row: List["MultipleRows"] = []

    def __init__(self, func):
        """ Every column using @duplicate_row keeps a queue cache allowing it to loop
        the list result from the inner lambda over several output rows.

        :param func: the wrapped method that returns a list of values
        """
        self.queue = []  # values still to be emitted for the current input row
        self.pos = 0  # index of the next value to emit from self.queue
        self.func = func
        self.el_on_row.append(self)

    def is_last(self):
        """ This is the last MultipleRows field on the processed row """
        return self.el_on_row[-1] == self

    def run(self):
        """ Build and return the wrapper that replaces the decorated function.

        The wrapper emits one value per call, refilling its queue from the
        wrapped function whenever the queue has been exhausted (``pos == 0``).
        """
        @wraps(self.func)
        def func_wrapper(val):
            if self.pos == 0:
                self.queue.extend(copy(self.func(val)))  # load new list from the inner lambda
                if not self.queue:  # lambda returned an empty list - this is not a fail, return empty string then
                    self.queue.append("")
                if self.is_last():
                    # Equalise queue lengths across all columns so every column
                    # yields the same total number of values.
                    max_ = prod([len(el.queue) for el in self.el_on_row])
                    for el in self.el_on_row:  # 1st col returns 3 values, 2nd 2 values → both should have 3*2 = 6 values
                        # NOTE(review): dividing by len(self.queue) (the last
                        # column's queue) looks suspicious; len(el.queue) would
                        # match the stated 3*2 = 6 intent — confirm.
                        el.queue *= max_ // len(self.queue)
            v = self.queue[self.pos]
            self.pos += 1
            if self.pos == len(self.queue):
                # Queue exhausted: reset state for the next input row.
                self.queue.clear()
                self.pos = 0
                return v
            if self.is_last():  # tuple indicates that Processor should duplicate row because we have some values in the queues
                return v,
            return v

        return func_wrapper

    @staticmethod
    def clear():
        """ Clear caches that ex allow a column using @duplicate_row to loop result from an inner lambda """
        MultipleRows.el_on_row.clear()

    @staticmethod
    def duplicate_row(func):
        """ Decorate a function that computes a list instead of a scalar. """
        return MultipleRows, func


class Checker:
""" To not pollute the namespace, we put the methods here """

hostname_ips_cache = {}
hostname_cache = {}

@staticmethod
@MultipleRows.duplicate_row
def hostname2ips(hostname):
if hostname not in Checker.hostname_ips_cache:
try:
Checker.hostname_ips_cache[hostname] = {addr[4][0] for addr in socket.getaddrinfo(hostname, None)}
Checker.hostname_ips_cache[hostname] = list({addr[4][0] for addr in socket.getaddrinfo(hostname, None)})
except OSError:
Checker.hostname_ips_cache[hostname] = []
return Checker.hostname_ips_cache[hostname]
Expand Down Expand Up @@ -198,24 +141,43 @@ def __init__(self, url):
if url in self.cache:
self.get = self.cache[url]
return
try:
logger.info(f"Scrapping {url}...")
response = requests.get(url, timeout=3, headers=self.headers)
except IOError as e:
self.get = str(e), None, None, None
else:
response.encoding = response.apparent_encoding # https://stackoverflow.com/a/52615216/2036148
if self.store_text:
soup = BeautifulSoup(response.text, features="html.parser")
[s.extract() for s in soup(["style", "script", "head"])] # remove tags with low probability of content
text = re.sub(r'\n\s*\n', '\n', soup.text) # reduce multiple new lines to singles
text = re.sub(r'[^\S\r\n][^\S\r\n]*[^\S\r\n]', ' ', text) # reduce multiple spaces (not new lines) to singles
logger.info(f"Scrapping {url}...")
redirects = []
current_url = url
while True:
try:
response = requests.get(current_url, timeout=3, headers=self.headers, allow_redirects=False)
except IOError as e:
if isinstance(e, requests.exceptions.HTTPError):
s = "Http error"
elif isinstance(e, requests.exceptions.ConnectionError):
s = "Error Connecting"
elif isinstance(e, requests.exceptions.Timeout):
s = "Timeout Error:"
elif isinstance(e, requests.exceptions.RequestException):
s = "Oops : Something Else"
else:
s = e
self.get = str(s), None, None, redirects
break
if response.headers.get("Location"):
current_url = response.headers.get("Location")
redirects.append(current_url)
continue
else:
text = None
redirects = ""
for res in response.history[1:]:
redirects = f"REDIRECT {res.status_code}{res.url}\n" + text
self.get = response.status_code, text, response.text if self.store_html else None, redirects
response.encoding = response.apparent_encoding # https://stackoverflow.com/a/52615216/2036148
if self.store_text:
soup = BeautifulSoup(response.text, features="html.parser")
[s.extract() for s in soup(["style", "script", "head"])] # remove tags with low probability of content
text = re.sub(r'\n\s*\n', '\n', soup.text) # reduce multiple new lines to singles
text = re.sub(r'[^\S\r\n][^\S\r\n]*[^\S\r\n]', ' ', text) # reduce multiple spaces (not new lines) to singles
else:
text = None
# for res in response.history[1:]:
# redirects += f"REDIRECT {res.status_code} → {res.url}\n" + text
# redirects.append(res.url)
self.get = response.status_code, text, response.text if self.store_html else None, redirects
break
self.cache[url] = self.get


Expand Down Expand Up @@ -351,7 +313,7 @@ def check_conformity(self, samples, has_header, field):
s = str(field).replace(" ", "").replace("'", "").replace('"', "").lower()
for n in self.usual_names:
if s in n or n in s:
print("HEADER match", field, self, self.usual_names)
# print("HEADER match", field, self, self.usual_names)
score += 2 if self.usual_must_match else 1
break
if not score and self.usual_must_match:
Expand Down Expand Up @@ -504,7 +466,7 @@ def _get_methods():
# portIP: IP written with a port 91.222.204.175.23 -> 91.222.204.175
(t.url, t.hostname): Whois.url2hostname,
(t.hostname, t.ip): Checker.hostname2ips if Config.get("multiple_ips_from_hostname", "FIELDS") else Checker.hostname2ip,
(t.url, t.ip): Whois.url2ip,
# (t.url, t.ip): Whois.url2ip,
(t.ip, t.whois): Whois,
(t.cidr, t.ip): lambda x: str(ipaddress.ip_interface(x).ip),
(t.whois, t.prefix): lambda x: (x, str(x.get[0])),
Expand Down Expand Up @@ -559,7 +521,7 @@ def __init__(self, parser):
if module:
for method in (x for x in dir(module) if not x.startswith("_")):
methods[(Types.plaintext, method)] = getattr(module, method)
logger.info("Successfully added method {method} from module {path}")
logger.info(f"Successfully added method {method} from module {path}")
except Exception as e:
s = "Can't import custom file from path: {}".format(path)
input(s + ". Press any key...")
Expand Down Expand Up @@ -634,9 +596,7 @@ def reg_s_method(s):
lambdas = [] # list of lambdas to calculate new field
for i in range(len(path) - 1):
lambda_ = methods[path[i], path[i + 1]]
if type(lambda_) == tuple and lambda_[0] == MultipleRows:
lambda_ = MultipleRows(lambda_[1]).run()
elif not hasattr(lambda_, "__call__"): # the field is invisible, see help text for Types; may be False, None or True
if not hasattr(lambda_, "__call__"): # the field is invisible, see help text for Types; may be False, None or True
continue
lambdas.append(lambda_)

Expand Down Expand Up @@ -815,6 +775,9 @@ def get_fitting_source(self, target_type: Type, *task):
source_type = None
task = list(task)

if Config.is_debug():
print(f"Getting type {target_type} with args {task}")

# determining source_col_i from a column candidate
column_candidate = task.pop(0) if len(task) else None
if column_candidate: # determine COLUMN
Expand Down Expand Up @@ -868,10 +831,17 @@ def get_fitting_source(self, target_type: Type, *task):
print(f"No suitable column found for field '{target_type}'")
quit()

f = self.parser.fields[source_col_i]
try:
f = self.parser.fields[source_col_i]
except IndexError:
print(f"Column ID {source_col_i + 1} does not exist, only these: " + ", ".join(f.name for f in self.parser.fields))
quit()
if graph.dijkstra(target_type, start=source_type) is False:
print(f"No suitable path from '{f.name}' treated as '{source_type}' to '{target_type}'")
quit()

if Config.is_debug():
print(f"Got type {target_type} of field={f}, source_type={source_type}, custom={task}")
return f, source_type, task

def get_column_i(self, column):
Expand Down
5 changes: 2 additions & 3 deletions convey/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .config import Config, get_terminal_size
from .contacts import Contacts, Attachment
from .dialogue import Cancelled, is_yes, ask
from .identifier import Identifier, b64decode, Types, Type, Web, TypeGroup, MultipleRows
from .identifier import Identifier, b64decode, Types, Type, Web, TypeGroup
from .informer import Informer
from .processor import Processor
from .whois import Whois
Expand Down Expand Up @@ -342,8 +342,7 @@ def write(self, row):
cw.writerow([f for f in self.fields if f.is_chosen])
self.header = wr.written
self._reset_output()
self.get_sample_values() # assure sout_info would consume a result from duplicate_row
MultipleRows.clear()
#self.get_sample_values() # assure sout_info would consume a result from duplicate_row

self.time_start = None
self.time_end = None
Expand Down
Loading

0 comments on commit d8629b2

Please sign in to comment.