diff --git a/controller.py b/controller.py
index 6be3f29..30ee8ca 100644
--- a/controller.py
+++ b/controller.py
@@ -32,48 +32,7 @@
 '''
 import utils
-
-from robot import *
-from graph import StateFlowGraph as sfg
-from state import StateMachine
-from spider import Spider
-
+import threading
 import logging
-## Set up logger ######################
-logger = logging.getLogger(__name__)
-logger.setLevel(logging.DEBUG)
-
-# create a file handler
-handler = logging.FileHandler('debug.log')
-handler.setLevel(logging.INFO)
-
-# create console handler with a higher log level
-ch = logging.StreamHandler()
-ch.setLevel(logging.ERROR)
-
-# create a logging format
-formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-handler.setFormatter(formatter)
-ch.setFormatter(formatter)
-
-# add the handlers to the logger
-logger.addHandler(handler)
-logger.addHandler(ch)
-
-#######################################
-
-
-def start(Spider):
-    """
-    Controls the start of the Spider instance
-    """
-
-
-def stop():
-
-
-def pause():
-
-def
+from main import Core
diff --git a/robot.py b/embedded_browser.py
similarity index 88%
rename from robot.py
rename to embedded_browser.py
index 3d34ea4..06267cd 100644
--- a/robot.py
+++ b/embedded_browser.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python2
+#!/usr/bin/python2
 # -*- coding: utf-8 -*-
 '''
 owtf is an OWASP+PTES-focused try to unite great tools and facilitate pen testing
@@ -35,57 +35,49 @@
 import simplejson as json
 from lxml import html
-from urllib2 import urlopen
-import urllib2
-
 from selenium.webdriver import *
 from selenium.webdriver.common.keys import Keys
 from selenium.common.exceptions import *
 from selenium.webdriver.support.ui import WebDriverWait
+from main import Core

-## TODO: Possible optimization in form of passing DOM tree to lxml for analysing and parsing
+## TODO: Possible optimization in form of passing DOM tree to lxml for analysing and parsing
 ## Right now, for initial implementation should only include using Selenium to perform DOM tree transversal
-
 #***************************** BUILD your browser here************************************

 class WebDriverFactory(object):
     """
     This takes care of building a browser based on config file
     """
-    def __init__(self, config):
-        with open('config.json') as config:
-            data = json.load(config)
-            self.config = data
+    def __init__(self, Core):
+        self.Core = Core

-    def create_webdriver(self, driver):
+    def create_webdriver(self):
         """ create a browser based on WebDriverWait """
         # handle each case
-        if driver == "firefox":
+        if self.Core.CONFIG["driver"] == "firefox":
             profile = FirefoxProfile()
             profile.set_preference("network.proxy.type", 1)
             profile.set_preference("network.proxy.http", "127.0.0.1")
             profile.set_preference("network.proxy.http_port", "8008")
-            #use proxy for everything, including localhost
             profile.set_preference("network.proxy.no_proxies_on", "");
             profile.update_preferences()
             browser = Firefox(firefox_profile=profile)
             return browser
-        elif driver == "chrome":
+        elif self.Core.CONFIG["driver"] == "chrome":
             options = ChromeOptions()
-            # set proxy options
-            options.add_arguments("--proxy-server=http://127.0.0.1:8008/")
-            browser = Chrome(executable_path=self.config["chromedriver_path"], options)
+            options.add_argument("--proxy-server=http://127.0.0.1:8008/")
+            browser = Chrome(executable_path=self.Core.CONFIG["chromedriver_path"], chrome_options=options)
             return browser
-        elif driver == "phantomjs":
-            #proxy configuration
+        elif self.Core.CONFIG["driver"] == "phantomjs":
             service_args = (
                 '--proxy=127.0.0.1:8008',\
                 '--proxy-type=http',\
@@ -98,21 +90,18 @@ def create_webdriver(self, driver):
 class WebDriverManager(object):

-    # Config setting to use new webdriver instance per thread.
-    ENABLE_THREADING_SUPPORT = "browser["threaded"]"
-
-    # Config setting to reuse browser instances between WebdriverManager.new_driver() calls.
-    INSTANCES = "browser["instances"]"
-
-
-    def __init__(self, config, webdriver_factory):
-        with open('config.json') as config:
-            data = json.load(config)
-            self.config = data
+    def __init__(self, Core, webdriver_factory):
+        self.Core = Core
         self.__webdriver = {}  # Object with channel as a key
         self.__registered_drivers = {}
-        self._webdriver_factory = WebDriverFactory()
+        self._webdriver_factory = webdriver_factory
+
+        # Config setting to use a new webdriver instance per thread.
+        self.ENABLE_THREADING_SUPPORT = self.Core.CONFIG["browser.threaded"]
+
+        # Config setting to reuse browser instances between WebdriverManager.new_driver() calls.
+        self.INSTANCES = self.Core.CONFIG["browser.instances"]
+
     def get_driver(self):
         """
         Get an already running instance of Webdriver. If there is none, it will create one.
         """
@@ -154,20 +143,15 @@ def new_driver(self):
         driver = self.__get_driver_for_channel(channel)
         # if self.__config.get(WebDriverManager.REUSE_BROWSER, True):
-            if driver is None:
-                driver = self._webdriver_factory.create_webdriver(# global browser setting)
+        if driver is None:
+            driver = self._webdriver_factory.create_webdriver()
-                # Register webdriver so it can be retrieved by the manager and
-                # cleaned up after exit.
-                self.__register_driver(channel, driver)
-            else:
-                try:
-                    driver.quit()
-                except:
-                    pass
+            # Register webdriver so it can be retrieved by the manager and
+            # cleaned up after exit.
+            self.__register_driver(channel, driver)
-                driver = self._webdriver_factory.create_webdriver(# global browser name)
-                self.__register_driver(channel, driver)
         else:
             # Attempt to tear down any existing webdriver.
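
Review note between these two hunks: the channel-keyed caching that get_driver()/new_driver() implement boils down to the pattern sketched below. This is an illustrative standalone sketch, not part of the patch; the DriverCache name and its members are invented here.

    import threading

    class DriverCache(object):
        def __init__(self, factory, threaded=False):
            self._factory = factory    # e.g. a WebDriverFactory(core) instance
            self._threaded = threaded  # mirrors the browser["threaded"] setting
            self._drivers = {}         # channel -> webdriver instance

        def _channel(self):
            # One driver per thread when threading support is on,
            # otherwise a single shared channel.
            return threading.current_thread().ident if self._threaded else 0

        def get(self):
            channel = self._channel()
            if channel not in self._drivers:
                self._drivers[channel] = self._factory.create_webdriver()
            return self._drivers[channel]

        def quit_all(self):
            for driver in self._drivers.values():
                try:
                    driver.quit()
                except Exception:
                    pass
            self._drivers.clear()
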
@@ -177,7 +161,7 @@ def new_driver(self):
             except:
                 pass
             self.__unregister_driver(channel)
-            driver = self._webdriver_factory.create_webdriver(# global browser)
+            driver = self._webdriver_factory.create_webdriver()
             self.__register_driver(channel, driver)
         return driver
@@ -235,7 +219,8 @@ class WebDriverAPI(object):
     Provides a necessary higher-abstraction wrapper around selenium WebDriver
     """
-    def __init__(self, browser):
+    def __init__(self, Config, browser):
+        self.CONFIG = Config
         self.browser = WebDriverManager.get_driver()

     @staticmethod
@@ -296,10 +281,9 @@ def goToURL(self, url):
             #navigate().to() and get() are synonyms :)
             self.browser.get(url)
             handlePopUps()
-        except WebDriverException, e:
+        except WebDriverException:
             pass
-        except InterruptedException, e:
-            print "goToUrl got interrupted while waiting for the page to be loaded ", e
+        except InterruptedException:
             pass

     def handlePopUps(self):
@@ -309,8 +293,7 @@ def handlePopUps(self):
                 + "window.confirm = function(msg){return true;};" \
                 + "window.prompt = function(msg){return true;};" \
             )
-        except UnexpectedAlertPresentException, e:
-            print "Unexpected Alert element: ", e
+        except UnexpectedAlertPresentException:
             pass

     def goback(self):
@@ -347,7 +330,7 @@ def screenshot(self, filename):
         except Exception, e:
             print "Error: ", e

-    def dom(self):
+    def getDOM(self):
         return self.browser.page_source

 # Later define it in the user profiles, or take from owtf general.cfg
- -* This module defines a controller which manages the start, pause and stop - process of the robot and the state-flow graph engine -''' - -import math -import random -import uuid -from lxml import html - -from robot import * -from utils import * -from state import * - - -class StateFlowGraph(object): - """ - + Defines a event flow graph for DOM states - + The State-Flow Graph is a multi-edge directed graph with states (StateVetex) on the vertices and - + clickables (Eventable) on the edges. - """ - -class Graph: - """A graph connects nodes (verticies) by edges (links). Each edge can also - have a length associated with it. The constructor call is something like: - g = Graph({'A': {'B': 1, 'C': 2}) - this makes a graph with 3 nodes, A, B, and C, with an edge of length 1 from - A to B, and an edge of length 2 from A to C. You can also do: - g = Graph({'A': {'B': 1, 'C': 2}, directed=False) - This makes an undirected graph, so inverse links are also added. The graph - stays undirected; if you add more links with g.connect('B', 'C', 3), then - inverse link is also added. You can use g.nodes() to get a list of nodes, - g.get('A') to get a dict of links out of A, and g.get('A', 'B') to get the - length of the link from A to B. 'Lengths' can actually be any object at - all, and nodes can be any hashable object.""" - - def __init__(self, dict=None, directed=True): - self.dict = dict or {} - self.directed = directed - if not directed: self.make_undirected() - - def make_undirected(self): - "Make a digraph into an undirected graph by adding symmetric edges." - for a in self.dict.keys(): - for (b, distance) in self.dict[a].items(): - self.connect1(b, a, distance) - - def connect(self, A, B, distance=1): - """Add a link from A and B of given distance, and also add the inverse - link if the graph is undirected.""" - self.connect1(A, B, distance) - if not self.directed: self.connect1(B, A, distance) - - def connect1(self, A, B, distance): - "Add a link from A to B of given distance, in one direction only." - self.dict.setdefault(A,{})[B] = distance - - def get(self, a, b=None): - """Return a link distance or a dict of {node: distance} entries. - .get(a,b) returns the distance or None; - .get(a) returns a dict of {node: distance} entries, possibly {}.""" - links = self.dict.setdefault(a, {}) - if b is None: return links - else: return links.get(b) - - def nodes(self): - "Return a list of nodes in the graph." - return self.dict.keys() - -def UndirectedGraph(dict=None): - "Build a Graph where every edge (including future ones) goes both ways." - return Graph(dict=dict, directed=False) - -def RandomGraph(nodes=range(10), min_links=2, width=400, height=300, - curvature=lambda: random.uniform(1.1, 1.5)): - """Construct a random graph, with the specified nodes, and random links. - The nodes are laid out randomly on a (width x height) rectangle. - Then each node is connected to the min_links nearest neighbors. - Because inverse links are added, some nodes will have more connections. - The distance between nodes is the hypotenuse times curvature(), - where curvature() defaults to a random number between 1.1 and 1.5.""" - g = UndirectedGraph() - g.locations = {} - ## Build the cities - for node in nodes: - g.locations[node] = (random.randrange(width), random.randrange(height)) - ## Build roads from each city to at least min_links nearest neighbors. 
-    for i in range(min_links):
-        for node in nodes:
-            if len(g.get(node)) < min_links:
-                here = g.locations[node]
-                def distance_to_node(n):
-                    if n is node or g.get(node,n): return infinity
-                    return distance(g.locations[n], here)
-                neighbor = argmin(nodes, distance_to_node)
-                d = distance(g.locations[neighbor], here) * curvature()
-                g.connect(node, neighbor, int(d))
-    return g
-
diff --git a/lib/readability/__init__.py b/lib/readability/__init__.py
new file mode 100644
index 0000000..8822a51
--- /dev/null
+++ b/lib/readability/__init__.py
@@ -0,0 +1 @@
+from .readability import Document
diff --git a/lib/readability/cleaners.py b/lib/readability/cleaners.py
new file mode 100644
index 0000000..d6ef9c8
--- /dev/null
+++ b/lib/readability/cleaners.py
@@ -0,0 +1,32 @@
+# strip out a set of nuisance html attributes
+import re
+from lxml.html.clean import Cleaner
+
+bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*']
+single_quoted = "'[^']+'"
+double_quoted = '"[^"]+"'
+non_space = '[^ "\'>]+'
+htmlstrip = re.compile("<"  # open
+    "([^>]+) "  # prefix
+    "(?:%s) *" % ('|'.join(bad_attrs),) +  # undesirable attributes
+    '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) +  # value
+    "([^>]*)"  # postfix
+    ">"  # end
+, re.I)
+
+def clean_attributes(html):
+    while htmlstrip.search(html):
+        html = htmlstrip.sub('<\\1\\2>', html)
+    return html
+
+def normalize_spaces(s):
+    """Replace any sequence of whitespace characters with a single space."""
+    if not s: return ''
+    return ' '.join(s.split())
+
+html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
+                       style=True, links=True, meta=False, add_nofollow=False,
+                       page_structure=False, processing_instructions=True, embedded=False,
+                       frames=False, forms=False, annoying_tags=False, remove_tags=None,
+                       remove_unknown_tags=False, safe_attrs_only=False)
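
A quick sanity check of clean_attributes() above: the htmlstrip regex repeatedly rewrites tags until no listed nuisance attribute remains. A hedged example (the import path assumes the lib/readability package layout this patch adds):

    from lib.readability.cleaners import clean_attributes

    dirty = '<table width="400" style="color:red" border="1"><tr><td>x</td></tr></table>'
    # Each pass of the loop in clean_attributes strips one offending
    # attribute; it runs until the pattern no longer matches.
    print(clean_attributes(dirty))
    # should print something like: <table border="1"><tr><td>x</td></tr></table>
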
diff --git a/lib/readability/debug.py b/lib/readability/debug.py
new file mode 100644
index 0000000..a5e644d
--- /dev/null
+++ b/lib/readability/debug.py
@@ -0,0 +1,25 @@
+def save_to_file(text, filename):
+    f = open(filename, 'wt')
+    f.write('<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    f.write(text.encode('utf-8'))
+    f.close()
+
+uids = {}
+def describe(node, depth=2):
+    if not hasattr(node, 'tag'):
+        return "[%s]" % type(node)
+    name = node.tag
+    if node.get('id', ''): name += '#'+node.get('id')
+    if node.get('class', ''):
+        name += '.' + node.get('class').replace(' ','.')
+    if name[:4] in ['div#', 'div.']:
+        name = name[3:]
+    if name in ['tr', 'td', 'div', 'p']:
+        if not node in uids:
+            uid = uids[node] = len(uids)+1
+        else:
+            uid = uids.get(node)
+        name += "%02d" % (uid)
+    if depth and node.getparent() is not None:
+        return name+' - '+describe(node.getparent(), depth-1)
+    return name
diff --git a/lib/readability/encoding.py b/lib/readability/encoding.py
new file mode 100644
index 0000000..a72c34d
--- /dev/null
+++ b/lib/readability/encoding.py
@@ -0,0 +1,48 @@
+import re
+import chardet
+
+def get_encoding(page):
+    # Regex for XML and HTML Meta charset declaration
+    charset_re = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
+    pragma_re = re.compile(r'<meta.*?content=["\']*;?charset=(.+?)["\'>]', flags=re.I)
+    xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]')
+
+    declared_encodings = (charset_re.findall(page) +
+                          pragma_re.findall(page) +
+                          xml_re.findall(page))
+
+    # Try any declared encodings
+    if len(declared_encodings) > 0:
+        for declared_encoding in declared_encodings:
+            try:
+                page.decode(custom_decode(declared_encoding))
+                return custom_decode(declared_encoding)
+            except UnicodeDecodeError:
+                pass
+
+    # Fallback to chardet if declared encodings fail
+    text = re.sub('</?[^>]*>\s*', ' ', page)
+    enc = 'utf-8'
+    if not text.strip() or len(text) < 10:
+        return enc # can't guess
+    res = chardet.detect(text)
+    enc = res['encoding']
+    #print '->', enc, "%.2f" % res['confidence']
+    enc = custom_decode(enc)
+    return enc
+
+def custom_decode(encoding):
+    """Overrides encoding when charset declaration
+       or charset determination is a subset of a larger
+       charset. Created because of issues with Chinese websites"""
+    encoding = encoding.lower()
+    alternates = {
+        'big5': 'big5hkscs',
+        'gb2312': 'gb18030',
+        'ascii': 'utf-8',
+        'maccyrillic': 'cp1251',
+    }
+    if encoding in alternates:
+        return alternates[encoding]
+    else:
+        return encoding
\ No newline at end of file
diff --git a/lib/readability/htmls.py b/lib/readability/htmls.py
new file mode 100644
index 0000000..92598d4
--- /dev/null
+++ b/lib/readability/htmls.py
@@ -0,0 +1,114 @@
+from cleaners import normalize_spaces, clean_attributes
+from encoding import get_encoding
+from lxml.html import tostring
+import logging
+import lxml.html
+import re, sys
+
+utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
+
+def build_doc(page):
+    if isinstance(page, unicode):
+        enc = None
+        page_unicode = page
+    else:
+        enc = get_encoding(page) or 'utf-8'
+        page_unicode = page.decode(enc, 'replace')
+    doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser)
+    return doc, enc
+
+def js_re(src, pattern, flags, repl):
+    return re.compile(pattern, flags).sub(repl.replace('$', '\\'), src)
+
+
+def normalize_entities(cur_title):
+    entities = {
+        u'\u2014':'-',
+        u'\u2013':'-',
+        u'&mdash;': '-',
+        u'&ndash;': '-',
+        u'\u00A0': ' ',
+        u'\u00AB': '"',
+        u'\u00BB': '"',
+        u'&quot;': '"',
+    }
+    for c, r in entities.iteritems():
+        if c in cur_title:
+            cur_title = cur_title.replace(c, r)
+
+    return cur_title
+
+def norm_title(title):
+    return normalize_entities(normalize_spaces(title))
+
+def get_title(doc):
+    title = doc.find('.//title')
+    if title is None or len(title.text) == 0:
+        return '[no-title]'
+
+    return norm_title(title.text)
+
+def add_match(collection, text, orig):
+    text = norm_title(text)
+    if len(text.split()) >= 2 and len(text) >= 15:
+        if text.replace('"', '') in orig.replace('"', ''):
+            collection.add(text)
+
+def shorten_title(doc):
+    title = doc.find('.//title')
+    if title is None or title.text is None or
len(title.text) == 0: + return '' + + title = orig = norm_title(title.text) + + candidates = set() + + for item in ['.//h1', './/h2', './/h3']: + for e in list(doc.iterfind(item)): + if e.text: + add_match(candidates, e.text, orig) + if e.text_content(): + add_match(candidates, e.text_content(), orig) + + for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']: + for e in doc.cssselect(item): + if e.text: + add_match(candidates, e.text, orig) + if e.text_content(): + add_match(candidates, e.text_content(), orig) + + if candidates: + title = sorted(candidates, key=len)[-1] + else: + for delimiter in [' | ', ' - ', ' :: ', ' / ']: + if delimiter in title: + parts = orig.split(delimiter) + if len(parts[0].split()) >= 4: + title = parts[0] + break + elif len(parts[-1].split()) >= 4: + title = parts[-1] + break + else: + if ': ' in title: + parts = orig.split(': ') + if len(parts[-1].split()) >= 4: + title = parts[-1] + else: + title = orig.split(': ', 1)[1] + + if not 15 < len(title) < 150: + return orig + + return title + +def get_body(doc): + [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ] + raw_html = unicode(tostring(doc.body or doc)) + cleaned = clean_attributes(raw_html) + try: + #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? + return cleaned + except Exception: #FIXME find the equivalent lxml error + #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) + return raw_html diff --git a/lib/readability/readability.py b/lib/readability/readability.py new file mode 100755 index 0000000..bf058ed --- /dev/null +++ b/lib/readability/readability.py @@ -0,0 +1,619 @@ +#!/usr/bin/env python +import logging +import re +import sys + +from collections import defaultdict +from lxml.etree import tostring +from lxml.etree import tounicode +from lxml.html import document_fromstring +from lxml.html import fragment_fromstring + +from cleaners import clean_attributes +from cleaners import html_cleaner +from htmls import build_doc +from htmls import get_body +from htmls import get_title +from htmls import shorten_title + + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger() + + +REGEXES = { + 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), + 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I), + 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I), + 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I), + 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), + #'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), + #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), + #'trimRe': re.compile('^\s+|\s+$/'), + #'normalizeRe': re.compile('\s{2,}/'), + #'killBreaksRe': re.compile('((\s| ?)*){1,}/'), + #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I), + #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, +} + + +class Unparseable(ValueError): + pass + + +def describe(node, depth=1): + if not hasattr(node, 'tag'): + return "[%s]" % type(node) + name = node.tag + if node.get('id', ''): + name += '#' 
+ node.get('id') + if node.get('class', ''): + name += '.' + node.get('class').replace(' ', '.') + if name[:4] in ['div#', 'div.']: + name = name[3:] + if depth and node.getparent() is not None: + return name + ' - ' + describe(node.getparent(), depth - 1) + return name + + +def to_int(x): + if not x: + return None + x = x.strip() + if x.endswith('px'): + return int(x[:-2]) + if x.endswith('em'): + return int(x[:-2]) * 12 + return int(x) + + +def clean(text): + text = re.sub('\s*\n\s*', '\n', text) + text = re.sub('[ \t]{2,}', ' ', text) + return text.strip() + + +def text_length(i): + return len(clean(i.text_content() or "")) + +regexp_type = type(re.compile('hello, world')) + +def compile_pattern(elements): + if not elements: + return None + if isinstance(elements, regexp_type): + return elements + if isinstance(elements, basestring): + elements = elements.split(',') + return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) + +class Document: + """Class to build a etree document out of html.""" + TEXT_LENGTH_THRESHOLD = 25 + RETRY_LENGTH = 250 + + def __init__(self, input, positive_keywords=None, negative_keywords=None, **options): + """Generate the document + + :param input: string of the html content. + + kwargs: + - attributes: + - debug: output debug messages + - min_text_length: + - retry_length: + - url: will allow adjusting links to be absolute + - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"] + - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"] + Also positive_keywords and negative_keywords could be a regexp. + """ + self.input = input + self.options = options + self.html = None + self.encoding = None + self.positive_keywords = compile_pattern(positive_keywords) + self.negative_keywords = compile_pattern(negative_keywords) + + def _html(self, force=False): + if force or self.html is None: + self.html = self._parse(self.input) + return self.html + + def _parse(self, input): + doc, self.encoding = build_doc(input) + doc = html_cleaner.clean_html(doc) + base_href = self.options.get('url', None) + if base_href: + doc.make_links_absolute(base_href, resolve_base_href=True) + else: + doc.resolve_base_href() + return doc + + def content(self): + return get_body(self._html(True)) + + def title(self): + return get_title(self._html(True)) + + def short_title(self): + return shorten_title(self._html(True)) + + def get_clean_html(self): + return clean_attributes(tounicode(self.html)) + + def summary(self, html_partial=False): + """Generate the summary of the html docuemnt + + :param html_partial: return only the div of the document, don't wrap + in html and body tags. + + """ + try: + ruthless = True + while True: + self._html(True) + for i in self.tags(self.html, 'script', 'style'): + i.drop_tree() + for i in self.tags(self.html, 'body'): + i.set('id', 'readabilityBody') + if ruthless: + self.remove_unlikely_candidates() + self.transform_misused_divs_into_paragraphs() + candidates = self.score_paragraphs() + + best_candidate = self.select_best_candidate(candidates) + + if best_candidate: + article = self.get_article(candidates, best_candidate, + html_partial=html_partial) + else: + if ruthless: + log.debug("ruthless removal did not work. ") + ruthless = False + self.debug( + ("ended up stripping too much - " + "going for a safer _parse")) + # try again + continue + else: + log.debug( + ("Ruthless and lenient parsing did not work. 
" + "Returning raw html")) + article = self.html.find('body') + if article is None: + article = self.html + cleaned_article = self.sanitize(article, candidates) + article_length = len(cleaned_article or '') + retry_length = self.options.get( + 'retry_length', + self.RETRY_LENGTH) + of_acceptable_length = article_length >= retry_length + if ruthless and not of_acceptable_length: + ruthless = False + # Loop through and try again. + continue + else: + return cleaned_article + except StandardError, e: + log.exception('error getting summary: ') + raise Unparseable(str(e)), None, sys.exc_info()[2] + + def get_article(self, candidates, best_candidate, html_partial=False): + # Now that we have the top candidate, look through its siblings for + # content that might also be related. + # Things like preambles, content split by ads that we removed, etc. + sibling_score_threshold = max([ + 10, + best_candidate['content_score'] * 0.2]) + # create a new html document with a html->body->div + if html_partial: + output = fragment_fromstring('
<div/>')
+        else:
+            output = document_fromstring('<div/>
') + best_elem = best_candidate['elem'] + for sibling in best_elem.getparent().getchildren(): + # in lxml there no concept of simple text + # if isinstance(sibling, NavigableString): continue + append = False + if sibling is best_elem: + append = True + sibling_key = sibling # HashableElement(sibling) + if sibling_key in candidates and \ + candidates[sibling_key]['content_score'] >= sibling_score_threshold: + append = True + + if sibling.tag == "p": + link_density = self.get_link_density(sibling) + node_content = sibling.text or "" + node_length = len(node_content) + + if node_length > 80 and link_density < 0.25: + append = True + elif node_length <= 80 \ + and link_density == 0 \ + and re.search('\.( |$)', node_content): + append = True + + if append: + # We don't want to append directly to output, but the div + # in html->body->div + if html_partial: + output.append(sibling) + else: + output.getchildren()[0].getchildren()[0].append(sibling) + #if output is not None: + # output.append(best_elem) + return output + + def select_best_candidate(self, candidates): + sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) + for candidate in sorted_candidates[:5]: + elem = candidate['elem'] + self.debug("Top 5 : %6.3f %s" % ( + candidate['content_score'], + describe(elem))) + + if len(sorted_candidates) == 0: + return None + + best_candidate = sorted_candidates[0] + return best_candidate + + def get_link_density(self, elem): + link_length = 0 + for i in elem.findall(".//a"): + link_length += text_length(i) + #if len(elem.findall(".//div") or elem.findall(".//p")): + # link_length = link_length + total_length = text_length(elem) + return float(link_length) / max(total_length, 1) + + def score_paragraphs(self, ): + MIN_LEN = self.options.get( + 'min_text_length', + self.TEXT_LENGTH_THRESHOLD) + candidates = {} + ordered = [] + for elem in self.tags(self._html(), "p", "pre", "td"): + parent_node = elem.getparent() + if parent_node is None: + continue + grand_parent_node = parent_node.getparent() + + inner_text = clean(elem.text_content() or "") + inner_text_len = len(inner_text) + + # If this paragraph is less than 25 characters + # don't even count it. + if inner_text_len < MIN_LEN: + continue + + if parent_node not in candidates: + candidates[parent_node] = self.score_node(parent_node) + ordered.append(parent_node) + + if grand_parent_node is not None and grand_parent_node not in candidates: + candidates[grand_parent_node] = self.score_node( + grand_parent_node) + ordered.append(grand_parent_node) + + content_score = 1 + content_score += len(inner_text.split(',')) + content_score += min((inner_text_len / 100), 3) + #if elem not in candidates: + # candidates[elem] = self.score_node(elem) + + #WTF? candidates[elem]['content_score'] += content_score + candidates[parent_node]['content_score'] += content_score + if grand_parent_node is not None: + candidates[grand_parent_node]['content_score'] += content_score / 2.0 + + # Scale the final candidates score based on link density. Good content + # should have a relatively small link density (5% or less) and be + # mostly unaffected by this operation. 
+ for elem in ordered: + candidate = candidates[elem] + ld = self.get_link_density(elem) + score = candidate['content_score'] + self.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % ( + score, + describe(elem), + ld, + score * (1 - ld))) + candidate['content_score'] *= (1 - ld) + + return candidates + + def class_weight(self, e): + weight = 0 + for feature in [e.get('class', None), e.get('id', None)]: + if feature: + if REGEXES['negativeRe'].search(feature): + weight -= 25 + + if REGEXES['positiveRe'].search(feature): + weight += 25 + + if self.positive_keywords and self.positive_keywords.search(feature): + weight += 25 + + if self.negative_keywords and self.negative_keywords.search(feature): + weight -= 25 + + if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag): + weight += 25 + + if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag): + weight -= 25 + + return weight + + def score_node(self, elem): + content_score = self.class_weight(elem) + name = elem.tag.lower() + if name == "div": + content_score += 5 + elif name in ["pre", "td", "blockquote"]: + content_score += 3 + elif name in ["address", "ol", "ul", "dl", "dd", "dt", "li", "form"]: + content_score -= 3 + elif name in ["h1", "h2", "h3", "h4", "h5", "h6", "th"]: + content_score -= 5 + return { + 'content_score': content_score, + 'elem': elem + } + + def debug(self, *a): + if self.options.get('debug', False): + log.debug(*a) + + def remove_unlikely_candidates(self): + for elem in self.html.iter(): + s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) + if len(s) < 2: + continue + #self.debug(s) + if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: + self.debug("Removing unlikely candidate - %s" % describe(elem)) + elem.drop_tree() + + def transform_misused_divs_into_paragraphs(self): + for elem in self.tags(self.html, 'div'): + # transform
<div>s that do not contain other block elements into
+            # <p>s
+            #FIXME: The current implementation ignores all descendants that
+            # are not direct children of elem
+            # This results in incorrect results in case there is an <img>
+            # buried within an <a> for example
+            if not REGEXES['divToPElementsRe'].search(
+                    unicode(''.join(map(tostring, list(elem))))):
+                #self.debug("Altering %s to p" % (describe(elem)))
+                elem.tag = "p"
+                #print "Fixed element "+describe(elem)
+
+        for elem in self.tags(self.html, 'div'):
+            if elem.text and elem.text.strip():
+                p = fragment_fromstring('<p/>')
+                p.text = elem.text
+                elem.text = None
+                elem.insert(0, p)
+                #print "Appended "+tounicode(p)+" to "+describe(elem)
+
+            for pos, child in reversed(list(enumerate(elem))):
+                if child.tail and child.tail.strip():
+                    p = fragment_fromstring('<p/>')
+                    p.text = child.tail
+                    child.tail = None
+                    elem.insert(pos + 1, p)
+                    #print "Inserted "+tounicode(p)+" to "+describe(elem)
+                if child.tag == 'br':
+                    #print 'Dropped <br> at '+describe(elem)
+                    child.drop_tree()
+
+    def tags(self, node, *tag_names):
+        for tag_name in tag_names:
+            for e in node.findall('.//%s' % tag_name):
+                yield e
+
+    def reverse_tags(self, node, *tag_names):
+        for tag_name in tag_names:
+            for e in reversed(node.findall('.//%s' % tag_name)):
+                yield e
+
+    def sanitize(self, node, candidates):
+        MIN_LEN = self.options.get('min_text_length',
+            self.TEXT_LENGTH_THRESHOLD)
+        for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
+            if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33:
+                header.drop_tree()
+
+        for elem in self.tags(node, "form", "iframe", "textarea"):
+            elem.drop_tree()
+        allowed = {}
+        # Conditionally clean <table>s, <ul>s, and <div>s
+        for el in self.reverse_tags(node, "table", "ul", "div"):
+            if el in allowed:
+                continue
+            weight = self.class_weight(el)
+            if el in candidates:
+                content_score = candidates[el]['content_score']
+                #print '!',el, '-> %6.3f' % content_score
+            else:
+                content_score = 0
+            tag = el.tag
+
+            if weight + content_score < 0:
+                self.debug("Cleaned %s with score %6.3f and weight %-3s" %
+                    (describe(el), content_score, weight, ))
+                el.drop_tree()
+            elif el.text_content().count(",") < 10:
+                counts = {}
+                for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
+                    counts[kind] = len(el.findall('.//%s' % kind))
+                counts["li"] -= 100
+
+                # Count the text length excluding any surrounding whitespace
+                content_length = text_length(el)
+                link_density = self.get_link_density(el)
+                parent_node = el.getparent()
+                if parent_node is not None:
+                    if parent_node in candidates:
+                        content_score = candidates[parent_node]['content_score']
+                    else:
+                        content_score = 0
+                #if parent_node is not None:
+                    #pweight = self.class_weight(parent_node) + content_score
+                    #pname = describe(parent_node)
+                #else:
+                    #pweight = 0
+                    #pname = "no parent"
+                to_remove = False
+                reason = ""
+
+                #if el.tag == 'div' and counts["img"] >= 1:
+                #    continue
+                if counts["p"] and counts["img"] > counts["p"]:
+                    reason = "too many images (%s)" % counts["img"]
+                    to_remove = True
+                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                    reason = "more <li>s than <p>s"
+                    to_remove = True
+                elif counts["input"] > (counts["p"] / 3):
+                    reason = "less than 3x <p>s than <input>s"
+                    to_remove = True
+                elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2):
+                    reason = "too short content length %s without a single image" % content_length
+                    to_remove = True
+                elif weight < 25 and link_density > 0.2:
+                    reason = "too many links %.3f for its weight %s" % (
+                        link_density, weight)
+                    to_remove = True
+                elif weight >= 25 and link_density > 0.5:
+                    reason = "too many links %.3f for its weight %s" % (
+                        link_density, weight)
+                    to_remove = True
+                elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
+                    reason = "<embed>s with too short content length, or too many <embed>s"
+                    to_remove = True
+#                if el.tag == 'div' and counts['img'] >= 1 and to_remove:
+#                    imgs = el.findall('.//img')
+#                    valid_img = False
+#                    self.debug(tounicode(el))
+#                    for img in imgs:
+#
+#                        height = img.get('height')
+#                        text_length = img.get('text_length')
+#                        self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
+#                        if to_int(height) >= 100 or to_int(text_length) >= 100:
+#                            valid_img = True
+#                            self.debug("valid image" + tounicode(img))
+#                            break
+#                    if valid_img:
+#                        to_remove = False
+#                        self.debug("Allowing %s" %el.text_content())
+#                        for desnode in self.tags(el, "table", "ul", "div"):
+#                            allowed[desnode] = True
+
+                #find x non empty preceding and succeeding siblings
+                i, j = 0, 0
+                x = 1
+                siblings = []
+                for sib in el.itersiblings():
+                    #self.debug(sib.text_content())
+                    sib_content_length = text_length(sib)
+                    if sib_content_length:
+                        i += 1
+                        siblings.append(sib_content_length)
+                        if i == x:
+                            break
+                for sib in el.itersiblings(preceding=True):
+                    #self.debug(sib.text_content())
+                    sib_content_length = text_length(sib)
+                    if sib_content_length:
+                        j += 1
+                        siblings.append(sib_content_length)
+                        if j == x:
+                            break
+                #self.debug(str(siblings))
+                if siblings and sum(siblings) > 1000:
+                    to_remove = False
+                    self.debug("Allowing %s" % describe(el))
+                    for desnode in self.tags(el, "table", "ul", "div"):
+                        allowed[desnode] = True
+
+                if to_remove:
+                    self.debug("Cleaned %6.3f %s with weight %s cause it has %s."
% + (content_score, describe(el), weight, reason)) + #print tounicode(el) + #self.debug("pname %s pweight %.3f" %(pname, pweight)) + el.drop_tree() + + for el in ([node] + [n for n in node.iter()]): + if not self.options.get('attributes', None): + #el.attrib = {} #FIXME:Checkout the effects of disabling this + pass + + self.html = node + return self.get_clean_html() + + +class HashableElement(): + def __init__(self, node): + self.node = node + self._path = None + + def _get_path(self): + if self._path is None: + reverse_path = [] + node = self.node + while node is not None: + node_id = (node.tag, tuple(node.attrib.items()), node.text) + reverse_path.append(node_id) + node = node.getparent() + self._path = tuple(reverse_path) + return self._path + path = property(_get_path) + + def __hash__(self): + return hash(self.path) + + def __eq__(self, other): + return self.path == other.path + + def __getattr__(self, tag): + return getattr(self.node, tag) + + +def main(): + from optparse import OptionParser + parser = OptionParser(usage="%prog: [options] [file]") + parser.add_option('-v', '--verbose', action='store_true') + parser.add_option('-u', '--url', default=None, help="use URL instead of a local file") + parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store') + parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store') + (options, args) = parser.parse_args() + + if not (len(args) == 1 or options.url): + parser.print_help() + sys.exit(1) + + file = None + if options.url: + import urllib + file = urllib.urlopen(options.url) + else: + file = open(args[0], 'rt') + enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING + try: + print Document(file.read(), + debug=options.verbose, + url=options.url, + positive_keywords = options.positive_keywords, + negative_keywords = options.negative_keywords, + ).summary().encode(enc, 'replace') + finally: + file.close() + +if __name__ == '__main__': + main() diff --git a/main.py b/main.py index 9860c78..05f176e 100644 --- a/main.py +++ b/main.py @@ -33,38 +33,36 @@ import simplejson as json import argparse -#from robot import WebDriverFactory, WebDriverManager +from robot import WebDriverFactory, WebDriverManager RootDir = os.path.dirname(os.path.abspath(sys.argv[0])) or '.' -class Config(object): + +class Core(object): """ - Config class provides the following functions: + Core class provides the following functions: - reads & loads configuration - provides a simple api for webapp - methods for updating config + - manages the app """ - def __init__(self): self.config_file = os.path.join(RootDir, 'configs', 'config.json') - - def read(self): with open(self.config_file) as data: - config = json.load(data) - #print config - return config + CONFIG = json.load(data) + return CONFIG + def logger(self): + """Init loggers, one redirected to a log file, the other to stdout.""" + # Logger for output in console. 
+ log = logging.getLogger('general') + infohandler = logging.StreamHandler(result_queue) + log.setLevel(logging.INFO) + infoformatter = logging.Formatter("%(message)s") + infohandler.setFormatter(infoformatter) + log.addHandler(infohandler) -class Init(object): - """ - Initialises webdriverfactory, loads and reads configuration from file, and creates necessary dirs - """ - - def __init__(self, config): - self.config = Config.read() if __name__ == "__main__": - s = Config() - s.read() - + s = Core() diff --git a/spider.py b/spider.py index 0297cdb..fe02373 100644 --- a/spider.py +++ b/spider.py @@ -1,40 +1,11 @@ #!/usr/bin/env python2 # -*- coding: utf-8 -*- -''' -owtf is an OWASP+PTES-focused try to unite great tools and facilitate pen testing -Copyright (c) 2011, Abraham Aranguren Twitter: @7a_ http://7-a.org -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the copyright owner nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -''' - -import utils -from robot import Browser -import controller -import state +from utils import dom_utils +from main import Core from lxml import html - from selenium.webdriver import * from selenium.webdriver.common.keys import Keys from selenium.common.exceptions import * @@ -43,26 +14,20 @@ class Spider(object): """ - + This is the main crawling engine. - + It will use the robot (browser) module to do the crawling - and will pass on the DOM tree for analysis. - - utils will take care of that - + The state module will provide the necessary functions for - creating state-flow graph. - + At last, the site mirroring and sitemap function will take over. + This is the main crawling engine. + + It will use the robot (browser) module to do the crawling + and will pass on the DOM tree for analysis. + + The state module will provide the necessary functions for + creating state-flow graph. 
""" - def __init__(self, crawlDepth, base_url, browser): - """ - * Initialize Spider instance - """ - self.site = base_url + def __init__(self, Core, depth, base_url): + self.Core = Core + self.base = base_url self.depth = crawlDepth - self.browser = browser + self.browser = def main(self): - """ - * Main crawler which loads the page, and extracts elements using their tag name - """ + """ Main crawler which loads the page, and simulated user actions. """ self.browser.gotoURL(base_url) diff --git a/state.py b/state_machine.py similarity index 59% rename from state.py rename to state_machine.py index 80f8586..e0ccd97 100644 --- a/state.py +++ b/state_machine.py @@ -29,41 +29,48 @@ ''' from lxml import html -import math -import uuid -from copy import deepcopy -from robot import * -from utils import * -from graph import * +import main +from embedded_browser import +import controller +from stategraph import StateFlowGraph +from utils import dom_utils -class State(object): - """ - * The state class which represents a state in the browser. When iterating over the possible - candidate elements every time a candidate is returned its removed from the list so it is a one - time only access to the candidates. - * Analogous to StateVertex class in Crawljax +class StateMachine(object): + """ The state machine class. """ - + url: the current url of the state - + name: the name of the state - + dom: the current DOM tree of the browser - """ - def __init__(self, browser, name, url, id, candidateElements, failedEvents): - """ - * Describes a state object - """ - self.browser = Browser() - self.name = name - self.url = Browser().current_url - self.id = hashlib.md5(str(uuid.uuid4())).hexdigest() - self.candidateElements = [] - self.failedEvents = failedEvents + def __init__(self, Core, embedded_browser): + self.Core = Core + self.browser = embedded_browser + def initialState(self): + pass -class StateMachine(object): + def currentState(self): + return get_state_by_id(index[0]) + + def newState(self): + dom = self.browser.getDom() + + return stateFlowGraph.new_state(self.browser.get_base_url(), + dom, + dom_utils.normalize(dom) + ) + + # ChangeS the currentState to the nextState if possible. The next state should already be + # present in the graph. + def changeState(self): + if not nextState: + return False + + if StateFlowGraph.can_goto(currentState, nextState): + # next state becomes the current state + currentState() = nextState(); + return True - def __init__(self, graph, initialState): - self.graph = graph - self.initial = initialState + else: + return False + # Adds the newState and the edge between the currentState and the newState on the SFG. + # SFG = stateFlowGraph diff --git a/stategraph.py b/stategraph.py new file mode 100644 index 0000000..86477f3 --- /dev/null +++ b/stategraph.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +''' +owtf is an OWASP+PTES-focused try to unite great tools and facilitate pen testing +Copyright (c) 2011, Abraham Aranguren Twitter: @7a_ http://7-a.org +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
diff --git a/stategraph.py b/stategraph.py
new file mode 100644
index 0000000..86477f3
--- /dev/null
+++ b/stategraph.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+'''
+owtf is an OWASP+PTES-focused try to unite great tools and facilitate pen testing
+Copyright (c) 2011, Abraham Aranguren Twitter: @7a_ http://7-a.org
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright owner nor the
+   names of its contributors may be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+* This module defines the state-flow graph used by the crawler: DOM states on
+  the vertices and clickables (events) on the edges.
+'''
+from multiprocessing import Process, Lock
+from lxml import html
+import networkx as nx
+
+from main import Core
+from utils import dom_utils
+
+# An eventable is the triple (original_state, final_state, event) stored on an edge.
+
+
+class state(object):
+    """
+    * The state vertex class which represents a state in the browser. When iterating over the possible
+    * candidate elements, every time a candidate is returned it is removed from the list, so it is a
+    * one-time-only access to the candidates.
+    """
+
+    def __init__(self, id, dom, url, name=None, candidate_elements=None):
+        self.id = id
+        self.dom = dom
+        self.stripped_dom = dom_utils.normalize(dom)
+        self.url = url
+        self.name = name
+        self.candidate_elements = candidate_elements
+        self.visited = False
+
+
+class StateFlowGraph(object):
+    """
+    Defines an event flow graph for DOM states
+    - The State-Flow Graph is a multi-edge directed graph with states (StateVertex) on the vertices and
+    - clickables (Eventable) on the edges.
+    """
+
+    def __init__(self, Core):
+        self.Core = Core
+        self.sfg = nx.MultiDiGraph()  # the graph is a multi-edged directed graph
+        self.states = {}
+        self.edges = {}
+
+    def add_event(self, initial_node, final_node, event):
+        """
+        * Adds the specified edge to this graph, going from the source vertex to the target vertex.
+        * More formally, adds the specified edge, e, to this graph if this graph contains no edge e2
+        * such that e2.equals(e). If this graph already contains such an edge, the call leaves this
+        * graph unchanged and returns false. Some graphs do not allow edge-multiplicity. In such cases,
+        * if the graph already contains an edge from the specified source to the specified target, then
+        * this method does not change the graph and returns false. If the edge was added to the graph,
+        * returns true. The source and target vertices must already be contained in this graph.
+        """
+        self.Core.logger.info('Adding the edge')
+        if self.sfg.has_edge(initial_node, final_node):
+            return False
+        else:
+            self.sfg.add_edge(initial_node, final_node, key=event)
+            return True
+
+    def add_state(self, state):
+        self.Core.logger.info('Adding a new state')
+        if self.sfg.has_node(state.id):
+            return False  # to speed up, this can also be written as: if state.id in self.sfg
+        else:
+            self.sfg.add_node(state.id)
+            self.states[state.id] = state
+            return True
+
+    def get_clickables(self, state):
+        # outgoing edges keyed by the event (clickable) that triggers them
+        return self.sfg[state.id]
+
+    def can_goto(self, source, target):
+        """ Boolean for existence of an edge. """
+        # both conditions checked because sfg is a directed graph
+        if self.sfg.has_edge(source, target) or self.sfg.has_edge(target, source):
+            return True
+        else:
+            return False
+
+    def get_shortest_path(self, start, end):
+        # shortest path via networkx (BFS, since the edges are unweighted)
+        return nx.shortest_path(self.sfg, start, end)
+
+    def get_all_states(self):
+        # in fact, get all nodes as a list
+        return self.sfg.nodes()
+
+    def get_all_possible_paths(self, start):
+        return nx.single_source_shortest_path_length(self.sfg, start)
+
+    def visited_states(self):
+        # collect the states whose visited flag is set
+        visited = []
+        for state_id in self.sfg:
+            state = self.states[state_id]
+            if state.visited:
+                visited.append(state)
+        return visited
diff --git a/utils/dom_utils.py b/utils/dom_utils.py
index 16657c7..763d099 100644
--- a/utils/dom_utils.py
+++ b/utils/dom_utils.py
@@ -39,9 +39,8 @@
 from robot import Browser

-def getStrippedDOM(html):
-    """
-    + Clean HTML using lxml.html.clean.Cleaner
+def normalize(html):
+    """Normalize the input HTML using lxml.html.clean.Cleaner
     """
     cleaner = Cleaner(comments=True, javascript=True,
         scripts=True, safe_attrs_only=True, page_structure=True,
@@ -49,6 +48,17 @@
     return cleaner.clean_html(html)

+# DOM equivalence algorithm: two DOMs are equivalent if their normalized
+# forms hash to the same value
+def isequivalent(dom1, dom2):
+    return hashcode(normalize(dom1)) == hashcode(normalize(dom2))
+
-def parse(html):
+def parse(source):
     """
     + This will convert the html source into a dom object
     """
     # Convert html source to dom object
     # Error catching because of badly formatted HTML, although lxml tends to perform very well :)
     try:
-        tree = html.fromstring(html.getStrippedDOM())  # Returns a XML tree
+        tree = html.fromstring(normalize(source))
         return tree
     except:
         print "Error in parsing HTML.."
-        # What to do here?
-        # This will almost certainly not work here
-        # make_links_absolute(url)

 def xpath(expression):
         return self.tree.xpath(expression)
@@ -75,13 +82,15 @@ def xpath(expression):
             selected_elements = map(lambda x: x.text, selected_elements)
         return selected_elements

-def hashcode(html):
+# Computes a hashcode from a string to compare if 2 DOMs are equivalent
+def hashcode(string):
     """
     + Calculates a hash based on an html string
     """
-    string = html.fromstring(html).tostring()
     return hashlib.md5(string).hexdigest()

+# Implemented in crawljax; not too efficient
+# It is too uptight on equivalence - would probably lead to state explosion
 def levenshtein(string1, string2):
     """
     + Measures the amount of difference between two strings.
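
Between these two hunks, a hedged usage sketch for the hash-based equivalence above, assuming both arguments are HTML source strings (lxml's Cleaner accepts and returns strings in that case):

    from utils import dom_utils

    a = '<html><body><p>hi</p><script>x()</script></body></html>'
    b = '<html><body><p>hi</p></body></html>'
    # The Cleaner strips the script element during normalize(), so both
    # documents should hash identically.
    print(dom_utils.isequivalent(a, b))  # should print True
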
@@ -127,8 +136,3 @@ def minEditDist(dom1, dom2): distance[i-1][j-1]+substCost(dom1[j-1],dom2[i-1])) return distance[n][m] -def diff(dom1, dom2): - """ - + Compares 2 stripped DOMs (based on lxml.html implementation) - """ - return html.diff.htmldiff(dom1, dom2) diff --git a/utils/file_utils.py b/utils/file_utils.py index 8b313bf..16c0e8c 100644 --- a/utils/file_utils.py +++ b/utils/file_utils.py @@ -33,13 +33,15 @@ import os import sys +import time +import errno -def ensure_dir(dir_path): +def ensure_dir(dir_path): if not os.path.exists(dir_path): print("Creating {0}".format(dir_path)) os.makedirs(dir_path) - else: + else: print("{0} already exists".format(dir_path)) def create_file(filepath, contents, overwrite=False): @@ -49,4 +51,132 @@ def create_file(filepath, contents, overwrite=False): text_file.write(contents) text_file.close() else: - print("{0} already exists.".format(filepath)) \ No newline at end of file + print("{0} already exists.".format(filepath)) + +""" +Implementation of a simple cross-platform file locking mechanism. +This is a modified version of code retrieved on 2013-01-01 from http://www.evanfosmark.com/2009/01/cross-platform-file-locking-support-in-python. +The original code was released under the BSD License, as is this modified version. + +Modifications in this version: + - Tweak docstrings for sphinx. + - Accept an absolute path for the protected file (instead of a file name relative to cwd). + - Allow timeout to be None. + - Fixed a bug that caused the original code to be NON-threadsafe when the same FileLock instance was shared by multiple threads in one process. + (The original was safe for multiple processes, but not multiple threads in a single process. This version is safe for both cases.) + - Added ``purge()`` function. + - Added ``available()`` function. + - Expanded API to mimic ``threading.Lock interface``: + - ``__enter__`` always calls ``acquire()``, and therefore blocks if ``acquire()`` was called previously. + - ``__exit__`` always calls ``release()``. It is therefore a bug to call ``release()`` from within a context manager. + - Added ``locked()`` function. + - Added blocking parameter to ``acquire()`` method + +WARNINGS: + - The locking mechanism used here may need to be changed to support NFS filesystems: + http://lwn.net/Articles/251004 + - This code has not been thoroughly tested on Windows, and there has been one report of incorrect results on Windows XP and Windows 7. + The locking mechanism used in this class should (in theory) be cross-platform, but use at your own risk. +""" + +class FileLock(object): + """ A file locking mechanism that has context-manager support so + you can use it in a ``with`` statement. This should be relatively cross + compatible as it doesn't rely on ``msvcrt`` or ``fcntl`` for the locking. + """ + + class FileLockException(Exception): + pass + + def __init__(self, protected_file_path, timeout=None, delay=1, lock_file_contents=None): + """ Prepare the file locker. Specify the file to lock and optionally + the maximum timeout and the delay between each attempt to lock. + """ + self.is_locked = False + self.lockfile = protected_file_path + ".lock" + self.timeout = timeout + self.delay = delay + self._lock_file_contents = lock_file_contents + if self._lock_file_contents is None: + self._lock_file_contents = "Owning process args:\n" + for arg in sys.argv: + self._lock_file_contents += arg + "\n" + + def locked(self): + """ + Returns True iff the file is owned by THIS FileLock instance. 
+ (Even if this returns false, the file could be owned by another FileLock instance, possibly in a different thread or process). + """ + return self.is_locked + + def available(self): + """ + Returns True iff the file is currently available to be locked. + """ + return not os.path.exists(self.lockfile) + + def acquire(self, blocking=True): + """ Acquire the lock, if possible. If the lock is in use, and `blocking` is False, return False. + Otherwise, check again every `self.delay` seconds until it either gets the lock or + exceeds `timeout` number of seconds, in which case it raises an exception. + """ + start_time = time.time() + while True: + try: + # Attempt to create the lockfile. + # These flags cause os.open to raise an OSError if the file already exists. + fd = os.open( self.lockfile, os.O_CREAT | os.O_EXCL | os.O_RDWR ) + with os.fdopen( fd, 'a' ) as f: + # Print some info about the current process as debug info for anyone who bothers to look. + f.write( self._lock_file_contents ) + break; + except OSError as e: + if e.errno != errno.EEXIST: + raise + if self.timeout is not None and (time.time() - start_time) >= self.timeout: + raise FileLock.FileLockException("Timeout occurred.") + if not blocking: + return False + time.sleep(self.delay) + self.is_locked = True + return True + + def release(self): + """ Get rid of the lock by deleting the lockfile. + When working in a `with` statement, this gets automatically + called at the end. + """ + self.is_locked = False + os.unlink(self.lockfile) + + + def __enter__(self): + """ Activated when used in the with statement. + Should automatically acquire a lock to be used in the with block. + """ + self.acquire() + return self + + + def __exit__(self, type, value, traceback): + """ Activated at the end of the with statement. + It automatically releases the lock if it isn't locked. + """ + self.release() + + + def __del__(self): + """ Make sure this ``FileLock`` instance doesn't leave a .lock file + lying around. + """ + if self.is_locked: + self.release() + + def purge(self): + """ + For debug purposes only. Removes the lock file from the hard disk. + """ + if os.path.exists(self.lockfile): + self.release() + return True + return False diff --git a/utils/test_utils.py b/utils/test_utils.py index 2e3b0c9..aadcda1 100644 --- a/utils/test_utils.py +++ b/utils/test_utils.py @@ -33,6 +33,10 @@ from six import u +from operator import itemgetter +from __future__ import generators + + def generate_timestamped_string(subject="test", number_of_random_chars=4): """ Generate time-stamped string. Format as follows... @@ -73,3 +77,172 @@ def generate_random_string(number_of_random_chars=8, character_set=string.ascii_ """ return u('').join(random.choice(character_set) for _ in range(number_of_random_chars)) + +# Sample algorithm implementation +# Full detail given here: https://github.com/Pent00/YenKSP + + +# Computes K-Shortest Paths using Yen's Algorithm. +# +# Yen's algorithm computes single-source K-shortest loopless paths for a graph +# with non-negative edge cost. The algorithm was published by Jin Y. Yen in 1971 +# and implores any shortest path algorithm to find the best path, then proceeds +# to find K-1 deviations of the best path. + +## Computes K paths from a source to a sink in the supplied graph. +# +# @param graph A digraph of class Graph. +# @param start The source node of the graph. +# @param sink The sink node of the graph. +# @param K The amount of paths being computed. 
diff --git a/utils/test_utils.py b/utils/test_utils.py
index 2e3b0c9..aadcda1 100644
--- a/utils/test_utils.py
+++ b/utils/test_utils.py
@@ -33,6 +33,9 @@
 from six import u
+from operator import itemgetter
+
+
 def generate_timestamped_string(subject="test", number_of_random_chars=4):
     """
     Generate time-stamped string. Format as follows...
@@ -73,3 +76,171 @@ def generate_random_string(number_of_random_chars=8, character_set=string.ascii_
     """
     return u('').join(random.choice(character_set) for _ in range(number_of_random_chars))
+
+# Sample algorithm implementation
+# Full detail given here: https://github.com/Pent00/YenKSP
+
+
+# Computes K-Shortest Paths using Yen's Algorithm.
+#
+# Yen's algorithm computes single-source K-shortest loopless paths for a graph
+# with non-negative edge cost. The algorithm was published by Jin Y. Yen in 1971
+# and employs any shortest-path algorithm to find the best path, then proceeds
+# to find K-1 deviations of the best path.
+
+## Computes K paths from a source to a sink in the supplied graph.
+#
+# @param graph      A digraph of class Graph.
+# @param node_start The source node of the graph.
+# @param node_end   The sink node of the graph.
+# @param max_k      The number of paths to compute.
+#
+# @retval [] Array of paths, where [0] is the shortest, [1] is the next
+#        shortest, and so on.
+#
+def ksp_yen(graph, node_start, node_end, max_k=2):
+    distances, previous = dijkstra(graph, node_start)
+
+    # NOTE: path() is YenKSP's path-reconstruction helper; it is not defined in this diff.
+    A = [{'cost': distances[node_end],
+          'path': path(previous, node_start, node_end)}]
+    B = []
+
+    if not A[0]['path']: return A
+
+    for k in range(1, max_k):
+        for i in range(0, len(A[-1]['path']) - 1):
+            node_spur = A[-1]['path'][i]
+            path_root = A[-1]['path'][:i+1]
+
+            edges_removed = []
+            for path_k in A:
+                curr_path = path_k['path']
+                if len(curr_path) > i and path_root == curr_path[:i+1]:
+                    cost = graph.remove_edge(curr_path[i], curr_path[i+1])
+                    if cost == -1:
+                        continue
+                    edges_removed.append([curr_path[i], curr_path[i+1], cost])
+
+            path_spur = dijkstra(graph, node_spur, node_end)
+
+            if path_spur['path']:
+                path_total = path_root[:-1] + path_spur['path']
+                dist_total = distances[node_spur] + path_spur['cost']
+                potential_k = {'cost': dist_total, 'path': path_total}
+
+                if potential_k not in B:
+                    B.append(potential_k)
+
+            for edge in edges_removed:
+                graph.add_edge(edge[0], edge[1], edge[2])
+
+        if len(B):
+            B = sorted(B, key=itemgetter('cost'))
+            A.append(B[0])
+            B.pop(0)
+        else:
+            break
+
+    return A
+
+## Computes the shortest path from a source to a sink in the supplied graph.
+#
+# @param graph      A digraph of class Graph.
+# @param node_start The source node of the graph.
+# @param node_end   The sink node of the graph.
+#
+# @retval {} Dictionary of path and cost, or, if node_end is not specified,
+#         the distances and previous lists are returned.
+#
+def dijkstra(graph, node_start, node_end=None):
+    distances = {}
+    previous = {}
+    Q = priorityDictionary()
+
+    for v in graph:
+        distances[v] = graph.INFINITY
+        previous[v] = graph.UNDEFINDED  # attribute spelled this way in YenKSP's Graph
+        Q[v] = graph.INFINITY
+
+    distances[node_start] = 0
+    Q[node_start] = 0
+
+    for v in Q:
+        if v == node_end:
+            break
+
+        for u in graph[v]:
+            cost_vu = distances[v] + graph[v][u]
+
+            if cost_vu < distances[u]:
+                distances[u] = cost_vu
+                Q[u] = cost_vu
+                previous[u] = v
+
+    if node_end:
+        return {'cost': distances[node_end],
+                'path': path(previous, node_start, node_end)}
+    else:
+        return (distances, previous)
+
+
+class priorityDictionary(dict):
+    def __init__(self):
+        '''Initialize priorityDictionary by creating binary heap of pairs
+        (value, key). Note that changing or removing a dict entry will not
+        remove the old pair from the heap until it is found by smallest() or
+        until the heap is rebuilt.'''
+        self.__heap = []
+        dict.__init__(self)
+
+    def smallest(self):
+        '''Find smallest item after removing deleted items from heap.'''
+        if len(self) == 0:
+            raise IndexError("smallest of empty priorityDictionary")
+        heap = self.__heap
+        while heap[0][1] not in self or self[heap[0][1]] != heap[0][0]:
+            lastItem = heap.pop()
+            insertionPoint = 0
+            while True:
+                smallChild = 2*insertionPoint + 1
+                if smallChild + 1 < len(heap) and \
+                        heap[smallChild] > heap[smallChild+1]:
+                    smallChild += 1
+                if smallChild >= len(heap) or lastItem <= heap[smallChild]:
+                    heap[insertionPoint] = lastItem
+                    break
+                heap[insertionPoint] = heap[smallChild]
+                insertionPoint = smallChild
+        return heap[0][1]
+
+    def __iter__(self):
+        '''Create destructive sorted iterator of priorityDictionary.'''
+        def iterfn():
+            while len(self) > 0:
+                x = self.smallest()
+                yield x
+                del self[x]
+        return iterfn()
+
+    def __setitem__(self, key, val):
+        '''Change value stored in dictionary and add corresponding pair to heap.
+        Rebuilds the heap if the number of deleted items grows too large, to
+        avoid memory leakage.'''
+        dict.__setitem__(self, key, val)
+        heap = self.__heap
+        if len(heap) > 2 * len(self):
+            self.__heap = [(v, k) for k, v in self.iteritems()]
+            self.__heap.sort()  # builtin sort likely faster than O(n) heapify
+        else:
+            newPair = (val, key)
+            insertionPoint = len(heap)
+            heap.append(None)
+            while insertionPoint > 0 and \
+                    newPair < heap[(insertionPoint-1)//2]:
+                heap[insertionPoint] = heap[(insertionPoint-1)//2]
+                insertionPoint = (insertionPoint-1)//2
+            heap[insertionPoint] = newPair
+
+    def setdefault(self, key, val):
+        '''Reimplement setdefault to call our customized __setitem__.'''
+        if key not in self:
+            self[key] = val
+        return self[key]
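+
+# Hypothetical usage sketch. Assumes a YenKSP-style digraph object: dict-of-dicts
+# node access, iterable over its nodes, INFINITY/UNDEFINDED attributes, and
+# add_edge()/remove_edge() with remove_edge() returning the removed cost or -1.
+# DiGraph below stands in for that class and is not defined in this module:
+#
+#   g = DiGraph()
+#   g.add_edge('a', 'b', 1)
+#   g.add_edge('b', 'c', 1)
+#   g.add_edge('a', 'c', 3)
+#   distances, previous = dijkstra(g, 'a')    # single-source shortest paths
+#   best_two = ksp_yen(g, 'a', 'c', max_k=2)  # [{'cost': 2, 'path': ['a', 'b', 'c']},
+#                                             #  {'cost': 3, 'path': ['a', 'c']}]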
diff --git a/webapp/static/css/font-awesome/css/font-awesome.min.css b/webapp/static/css/font-awesome/css/font-awesome.min.css
deleted file mode 100644
index 449d6ac..0000000
--- a/webapp/static/css/font-awesome/css/font-awesome.min.css
+++ /dev/null
@@ -1,4 +0,0 @@
-/*!
- * Font Awesome 4.0.3 by @davegandy - http://fontawesome.io - @fontawesome
- * License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License)
- */ [one long line of minified Font Awesome rules omitted]
\ No newline at end of file
diff --git a/webapp/static/js/graphbuilder.js b/webapp/static/js/graphbuilder.js
index 4abfd20..1f418a9 100644
--- a/webapp/static/js/graphbuilder.js
+++ b/webapp/static/js/graphbuilder.js
@@ -28,10 +28,10 @@ graphics.node(function(node) {
     var img = Viva.Graph.svg('image')
           .attr('width', nodeSize)
           .attr('height', nodeSize)
-          .link(node.data.img); 
+          .link(node.data.img);
     $(img).dblclick(function() { window.open(node.data.url, '_blank');});
     ui.append(svgText);
-    ui.append(img); 
+    ui.append(img);
     stroke = Viva.Graph.svg('rect')
            .attr("style", "fill:none;stroke-width:1;stroke:black;")
            .attr('width', nodeSize+1)
@@ -39,18 +39,18 @@ graphics.node(function(node) {
     ui.append(stroke);
     return ui;
 }).placeNode(function(nodeUI, pos) {
-    nodeUI.attr('transform', 
-                'translate(' + 
-                (pos.x - nodeSize/2) + ',' + (pos.y - nodeSize/2) + 
+    nodeUI.attr('transform',
+                'translate(' +
+                (pos.x - nodeSize/2) + ',' + (pos.y - nodeSize/2) +
                 ')');
-}); 
+});
 
 // To render an arrow we have to address two problems:
-// 1. Links should start/stop at node's bounding box, not at the node center. 
+// 1. Links should start/stop at node's bounding box, not at the node center.
 // 2. Render an arrow shape at the end of the link.
-// Rendering arrow shape is achieved by using SVG markers, part of the SVG 
+// Rendering arrow shape is achieved by using SVG markers, part of the SVG
 // standard: http://www.w3.org/TR/SVG/painting.html#Markers
 var createMarker = function(id) {
     return Viva.Graph.svg('marker')
@@ -71,7 +71,7 @@
 marker.append('path').attr('d', 'M 0 0 L 10 5 L 0 10 z').attr('stroke', 'grey');
 var defs = graphics.getSvgRoot().append('defs');
 defs.append(marker);
-var geom = Viva.Graph.geom(); 
+var geom = Viva.Graph.geom();
 graphics.link(function(link){
     // Notice the Triangle marker-end attribe:
@@ -86,7 +86,7 @@ graphics.link(function(link){
     });
     return path;
 }).placeLink(function(linkUI, fromPos, toPos) {
-    // Here we should take care about 
+    // Here we should take care about
     //  "Links should start/stop at node's bounding box, not at the node center."
 
     // For rectangular nodes Viva.Graph.geom() provides efficient way to find
@@ -101,7 +101,7 @@ graphics.link(function(link){
         fromPos.x + fromNodeSize / 2, // right
         fromPos.y + fromNodeSize / 2, // bottom
         // segment:
-        fromPos.x, fromPos.y, toPos.x, toPos.y) 
+        fromPos.x, fromPos.y, toPos.x, toPos.y)
         || fromPos; // if no intersection found - return center of the node
 
     var to = geom.intersectRect(
@@ -111,11 +111,12 @@ graphics.link(function(link){
         toPos.x + toNodeSize / 2, // right
         toPos.y + toNodeSize / 2, // bottom
         // segment:
-        toPos.x, toPos.y, fromPos.x, fromPos.y) 
+        toPos.x, toPos.y, fromPos.x, fromPos.y)
         || toPos; // if no intersection found - return center of the node
 
     var data = 'M' + from.x + ',' + from.y + 'L' + to.x + ',' + to.y;
     linkUI.attr("d", data);
-    });
\ No newline at end of file
+    });
+
diff --git a/webapp/templates/base.html b/webapp/templates/base.html
index 6b971d0..585693c 100644
--- a/webapp/templates/base.html
+++ b/webapp/templates/base.html
@@ -9,7 +9,7 @@
 
-
+
 
diff --git a/webapp/templates/configuration.html b/webapp/templates/configuration.html
index e69de29..ad9c9f5 100644
--- a/webapp/templates/configuration.html
+++ b/webapp/templates/configuration.html
@@ -0,0 +1,84 @@
+{% extends "base.html" %}
+
+{% block title %}Configuration Manager{% end %}
+
+{% block content %}
+
+    <!-- configuration form markup (84-line template body not recovered) -->
+
+{% end %}
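
For orientation: configuration.html above uses Tornado-style template inheritance ({% extends %}, {% block %}, {% end %}). A minimal sketch of how such a template might be served -- the handler class, route, and paths below are illustrative assumptions, not code from this patch:

    import tornado.ioloop
    import tornado.web

    class ConfigurationHandler(tornado.web.RequestHandler):
        def get(self):
            # Renders configuration.html, which fills the blocks declared in base.html
            self.render("configuration.html")

    application = tornado.web.Application(
        [(r"/configuration", ConfigurationHandler)],
        template_path="webapp/templates",
        static_path="webapp/static",
    )

    if __name__ == "__main__":
        application.listen(8888)
        tornado.ioloop.IOLoop.instance().start()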