Skip to content

Commit

Permalink
methods can return a list, no need for a decorator #32
Browse files Browse the repository at this point in the history
Web handles redirects even if they are not accessible

Signed-off-by: Edvard Rejthar <[email protected]>
  • Loading branch information
e3rd committed Oct 31, 2019
1 parent 2f1e693 commit d8629b2
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 119 deletions.
7 changes: 3 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,17 +160,16 @@ def any_method(value):

You may as well hard code custom fields in the [`config.ini`](convey/config.ini.default) by providing paths to the entrypoint Python files delimited by a comma: `custom_fields_modules = /tmp/myfile.py, /tmp/anotherfile.py`. All the public methods in the defined files will become custom fields!

If you need a single call to generate multiple rows, return list and decorate with @duplicate_row.
If you need a single call to generate multiple rows, return a list; any row receiving a list will be duplicated for each of its items.

```python3
from convey import duplicate_row

@duplicate_row
def any_method(value):
# do something
return ["foo", "bar"]
```

Ex: if one method returns 2 items and another returns 3 items, you will receive 6 similar rows (one per combination).

A handy feature if you wish to use the Shodan API (as one of our partners does) or to do anything similar.

## Examples
Expand Down
6 changes: 1 addition & 5 deletions convey/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,2 @@
# import Config file as the very first one (setup logging)
from .config import Config
from .identifier import MultipleRows
duplicate_row = MultipleRows.duplicate_row

__all__ = ["duplicate_row"]
from .config import Config
4 changes: 3 additions & 1 deletion convey/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,14 +54,16 @@ def application(env, start_response):
if not WebServer.source_parser:
from convey.config import Config
from convey.parser import Parser
from convey.identifier import Types
Types.init()
Config.init()
WebServer.source_parser = Parser(prepare=False)

headers = [('Access-Control-Allow-Origin', '*')]
t = env["QUERY_STRING"].split("q=") # XX sanitize?
if len(t) == 2:
res = WebServer.source_parser.set_stdin([t[1]]).prepare()
if res.single_value:
if res.is_single_value:
response = res.run_single_value(json=True)
headers.append(('Content-Type', 'application/json'))
status = '200 OK'
Expand Down
134 changes: 52 additions & 82 deletions convey/identifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import importlib.util
import ipaddress
import logging
import operator
import re
import socket
import subprocess
Expand All @@ -11,7 +10,6 @@
from copy import copy
from csv import Error, Sniffer, reader
from enum import IntEnum
from functools import wraps, reduce
from pathlib import Path
from typing import List

Expand Down Expand Up @@ -48,72 +46,17 @@ def check_ip(ip):
return False


def prod(iterable):  # XX as of Python3.8, replace with math.prod
    """ Return the product of all items in *iterable* (1 for an empty iterable). """
    result = 1
    for factor in iterable:
        result *= factor
    return result


class MultipleRows:
    """ Support for columns whose computing function returns a list instead of a scalar.

    Each instance wraps one such function; all instances active on the current
    row are registered in the class-level ``el_on_row`` list so that their
    result queues can be kept in sync while the Processor duplicates rows.
    """

    # All MultipleRows fields participating in the currently processed row,
    # in column order; the last registered one drives the duplication signal.
    el_on_row: List["MultipleRows"] = []

    def __init__(self, func):
        """ Every column using @duplicate_row keeps a queue cache allowing it to loop
        the list result from the inner lambda over several output rows.

        :param func: the wrapped method that returns a list of values
        """
        self.queue = []  # values still to be emitted for the current input row
        self.pos = 0  # index of the next value to emit from self.queue
        self.func = func
        self.el_on_row.append(self)

    def is_last(self):
        """ This is the last MultipleRows field on the processed row """
        return self.el_on_row[-1] == self

    def run(self):
        """ Build and return the wrapper that replaces the decorated function.

        The wrapper emits one value per call, refilling its queue from the
        wrapped function whenever the queue has been exhausted (``pos == 0``).
        """
        @wraps(self.func)
        def func_wrapper(val):
            if self.pos == 0:
                self.queue.extend(copy(self.func(val)))  # load new list from the inner lambda
                if not self.queue:  # lambda returned an empty list - this is not a fail, return empty string then
                    self.queue.append("")
                if self.is_last():
                    # Equalise queue lengths across all columns so every column
                    # yields the same total number of values.
                    max_ = prod([len(el.queue) for el in self.el_on_row])
                    for el in self.el_on_row:  # 1st col returns 3 values, 2nd 2 values → both should have 3*2 = 6 values
                        # NOTE(review): dividing by len(self.queue) (the last
                        # column's queue) looks suspicious; len(el.queue) would
                        # match the stated 3*2 = 6 intent — confirm.
                        el.queue *= max_ // len(self.queue)
            v = self.queue[self.pos]
            self.pos += 1
            if self.pos == len(self.queue):
                # Queue exhausted: reset state for the next input row.
                self.queue.clear()
                self.pos = 0
                return v
            if self.is_last():  # tuple indicates that Processor should duplicate row because we have some values in the queues
                return v,
            return v

        return func_wrapper

    @staticmethod
    def clear():
        """ Clear caches that ex allow a column using @duplicate_row to loop result from an inner lambda """
        MultipleRows.el_on_row.clear()

    @staticmethod
    def duplicate_row(func):
        """ Decorate a function that computes a list instead of a scalar. """
        return MultipleRows, func


class Checker:
""" To not pollute the namespace, we put the methods here """

hostname_ips_cache = {}
hostname_cache = {}

@staticmethod
@MultipleRows.duplicate_row
def hostname2ips(hostname):
if hostname not in Checker.hostname_ips_cache:
try:
Checker.hostname_ips_cache[hostname] = {addr[4][0] for addr in socket.getaddrinfo(hostname, None)}
Checker.hostname_ips_cache[hostname] = list({addr[4][0] for addr in socket.getaddrinfo(hostname, None)})
except OSError:
Checker.hostname_ips_cache[hostname] = []
return Checker.hostname_ips_cache[hostname]
Expand Down Expand Up @@ -198,24 +141,43 @@ def __init__(self, url):
if url in self.cache:
self.get = self.cache[url]
return
try:
logger.info(f"Scrapping {url}...")
response = requests.get(url, timeout=3, headers=self.headers)
except IOError as e:
self.get = str(e), None, None, None
else:
response.encoding = response.apparent_encoding # https://stackoverflow.com/a/52615216/2036148
if self.store_text:
soup = BeautifulSoup(response.text, features="html.parser")
[s.extract() for s in soup(["style", "script", "head"])] # remove tags with low probability of content
text = re.sub(r'\n\s*\n', '\n', soup.text) # reduce multiple new lines to singles
text = re.sub(r'[^\S\r\n][^\S\r\n]*[^\S\r\n]', ' ', text) # reduce multiple spaces (not new lines) to singles
logger.info(f"Scrapping {url}...")
redirects = []
current_url = url
while True:
try:
response = requests.get(current_url, timeout=3, headers=self.headers, allow_redirects=False)
except IOError as e:
if isinstance(e, requests.exceptions.HTTPError):
s = "Http error"
elif isinstance(e, requests.exceptions.ConnectionError):
s = "Error Connecting"
elif isinstance(e, requests.exceptions.Timeout):
s = "Timeout Error:"
elif isinstance(e, requests.exceptions.RequestException):
s = "Oops : Something Else"
else:
s = e
self.get = str(s), None, None, redirects
break
if response.headers.get("Location"):
current_url = response.headers.get("Location")
redirects.append(current_url)
continue
else:
text = None
redirects = ""
for res in response.history[1:]:
redirects = f"REDIRECT {res.status_code}{res.url}\n" + text
self.get = response.status_code, text, response.text if self.store_html else None, redirects
response.encoding = response.apparent_encoding # https://stackoverflow.com/a/52615216/2036148
if self.store_text:
soup = BeautifulSoup(response.text, features="html.parser")
[s.extract() for s in soup(["style", "script", "head"])] # remove tags with low probability of content
text = re.sub(r'\n\s*\n', '\n', soup.text) # reduce multiple new lines to singles
text = re.sub(r'[^\S\r\n][^\S\r\n]*[^\S\r\n]', ' ', text) # reduce multiple spaces (not new lines) to singles
else:
text = None
# for res in response.history[1:]:
# redirects += f"REDIRECT {res.status_code} → {res.url}\n" + text
# redirects.append(res.url)
self.get = response.status_code, text, response.text if self.store_html else None, redirects
break
self.cache[url] = self.get


Expand Down Expand Up @@ -351,7 +313,7 @@ def check_conformity(self, samples, has_header, field):
s = str(field).replace(" ", "").replace("'", "").replace('"', "").lower()
for n in self.usual_names:
if s in n or n in s:
print("HEADER match", field, self, self.usual_names)
# print("HEADER match", field, self, self.usual_names)
score += 2 if self.usual_must_match else 1
break
if not score and self.usual_must_match:
Expand Down Expand Up @@ -504,7 +466,7 @@ def _get_methods():
# portIP: IP written with a port 91.222.204.175.23 -> 91.222.204.175
(t.url, t.hostname): Whois.url2hostname,
(t.hostname, t.ip): Checker.hostname2ips if Config.get("multiple_ips_from_hostname", "FIELDS") else Checker.hostname2ip,
(t.url, t.ip): Whois.url2ip,
# (t.url, t.ip): Whois.url2ip,
(t.ip, t.whois): Whois,
(t.cidr, t.ip): lambda x: str(ipaddress.ip_interface(x).ip),
(t.whois, t.prefix): lambda x: (x, str(x.get[0])),
Expand Down Expand Up @@ -559,7 +521,7 @@ def __init__(self, parser):
if module:
for method in (x for x in dir(module) if not x.startswith("_")):
methods[(Types.plaintext, method)] = getattr(module, method)
logger.info("Successfully added method {method} from module {path}")
logger.info(f"Successfully added method {method} from module {path}")
except Exception as e:
s = "Can't import custom file from path: {}".format(path)
input(s + ". Press any key...")
Expand Down Expand Up @@ -634,9 +596,7 @@ def reg_s_method(s):
lambdas = [] # list of lambdas to calculate new field
for i in range(len(path) - 1):
lambda_ = methods[path[i], path[i + 1]]
if type(lambda_) == tuple and lambda_[0] == MultipleRows:
lambda_ = MultipleRows(lambda_[1]).run()
elif not hasattr(lambda_, "__call__"): # the field is invisible, see help text for Types; may be False, None or True
if not hasattr(lambda_, "__call__"): # the field is invisible, see help text for Types; may be False, None or True
continue
lambdas.append(lambda_)

Expand Down Expand Up @@ -815,6 +775,9 @@ def get_fitting_source(self, target_type: Type, *task):
source_type = None
task = list(task)

if Config.is_debug():
print(f"Getting type {target_type} with args {task}")

# determining source_col_i from a column candidate
column_candidate = task.pop(0) if len(task) else None
if column_candidate: # determine COLUMN
Expand Down Expand Up @@ -868,10 +831,17 @@ def get_fitting_source(self, target_type: Type, *task):
print(f"No suitable column found for field '{target_type}'")
quit()

f = self.parser.fields[source_col_i]
try:
f = self.parser.fields[source_col_i]
except IndexError:
print(f"Column ID {source_col_i + 1} does not exist, only these: " + ", ".join(f.name for f in self.parser.fields))
quit()
if graph.dijkstra(target_type, start=source_type) is False:
print(f"No suitable path from '{f.name}' treated as '{source_type}' to '{target_type}'")
quit()

if Config.is_debug():
print(f"Got type {target_type} of field={f}, source_type={source_type}, custom={task}")
return f, source_type, task

def get_column_i(self, column):
Expand Down
5 changes: 2 additions & 3 deletions convey/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .config import Config, get_terminal_size
from .contacts import Contacts, Attachment
from .dialogue import Cancelled, is_yes, ask
from .identifier import Identifier, b64decode, Types, Type, Web, TypeGroup, MultipleRows
from .identifier import Identifier, b64decode, Types, Type, Web, TypeGroup
from .informer import Informer
from .processor import Processor
from .whois import Whois
Expand Down Expand Up @@ -342,8 +342,7 @@ def write(self, row):
cw.writerow([f for f in self.fields if f.is_chosen])
self.header = wr.written
self._reset_output()
self.get_sample_values() # assure sout_info would consume a result from duplicate_row
MultipleRows.clear()
#self.get_sample_values() # assure sout_info would consume a result from duplicate_row

self.time_start = None
self.time_end = None
Expand Down
Loading

0 comments on commit d8629b2

Please sign in to comment.