Fix dependencies #85

Open
wants to merge 2 commits into base: master
Binary file added libs/libcalg.so
Binary file not shown.
Binary file added libs/libcalg.so.0.0.0
Binary file not shown.
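The two binaries above are a prebuilt copy of the libcalg shared library that the new ctypes wrapper (xsscrapy/bloomfilter.py, added below) loads at runtime. Note that the wrapper resolves "libs/libcalg.so" against the current working directory, so the crawler has to be started from the repository root. A minimal sketch of a location-independent load, assuming the library stays in libs/ next to the package directory (editor's illustration, not part of this PR):

import os
from ctypes import CDLL

# Hypothetical variant: resolve libs/libcalg.so relative to this module rather
# than the process working directory (the PR uses os.path.abspath("libs/libcalg.so")).
_here = os.path.dirname(os.path.abspath(__file__))
calg = CDLL(os.path.join(_here, os.pardir, "libs", "libcalg.so"))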
24 changes: 12 additions & 12 deletions requirements.txt
@@ -1,12 +1,12 @@
scrapy
pybloomfilter
requests
bs4
twisted==16.6.0
w3lib
lxml
six
cssselect
pyopenssl
cryptography
queuelib
scrapy==1.8.4
pybloomfilter==1.0
requests==2.27.1
beautifulsoup4==4.9.3
twisted==20.3.0
w3lib==1.22.0
lxml==5.0.1
six==1.16.0
cssselect==1.1.0
pyopenssl==21.0.0
cryptography==3.3.2
queuelib==1.5.0
119 changes: 119 additions & 0 deletions xsscrapy/bloomfilter.py
@@ -0,0 +1,119 @@
# pybloomfilter.py
#
# Copyright 2009 ahmed youssef <[email protected]>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.



__all__=["BloomFilter"]


from ctypes import *
import os

lib_path = os.path.abspath("libs/libcalg.so")
try:
    calg = CDLL(lib_path)
except OSError:
    print "Error loading libcalg.so"
    exit(1)

# bloomfilter c-definitions

bloomfilter_value = c_void_p
HASH_FUNC = CFUNCTYPE(c_ulong, bloomfilter_value)

# Hash functions exported by libcalg, wrapped with the HASH_FUNC prototype so
# they can be passed directly to bloom_filter_new() below.
string_hash = HASH_FUNC(("string_hash", calg))
int_hash = HASH_FUNC(("int_hash", calg))

class BloomFilterStruct(Structure):
    _fields_ = [
        ("hash_func", HASH_FUNC),
        ("table", POINTER(c_ubyte)),
        ("table_size", c_uint),
        ("num_functions", c_uint),
    ]

bloomfilter_p = POINTER(BloomFilterStruct)

bf_new = calg.bloom_filter_new
bf_new.restype = bloomfilter_p
bf_new.argtypes = [c_uint, HASH_FUNC, c_uint]

bf_free = calg.bloom_filter_free
bf_free.restype = None
bf_free.argtypes = [bloomfilter_p]

bf_insert = calg.bloom_filter_insert
bf_insert.restype = None
bf_insert.argtypes = [bloomfilter_p, bloomfilter_value]

bf_query = calg.bloom_filter_query
bf_query.restype = c_int
bf_query.argtypes = [bloomfilter_p, bloomfilter_value]

# python wrapper

class BloomFilter:

    def __init__(self, table_size=128, hash_func=string_hash, num_functions=1):
        """
        A bloom filter is a space efficient data structure that can be used to test whether a given element is part of a set.
        Lookups will occasionally generate false positives, but never false negatives.
        """
        self._bloomfilter = bf_new(table_size, hash_func, num_functions)

    def insert(self, val):
        """
        Insert a value into the bloom filter.
        """
        bf_insert(self._bloomfilter, str(val))

    def query(self, val):
        """
        Query a bloom filter for a particular value.
        """
        return bf_query(self._bloomfilter, str(val))

    def __contains__(self, val):
        """
        Check if a value is in the bloom filter.
        """
        return self.query(val)

    def __del__(self):
        """
        Explicitly free the resources allocated by the bloom filter.
        """
        if self._bloomfilter:
            bf_free(self._bloomfilter)


if __name__ == "__main__":

    b = BloomFilter()
    b.insert("ahmed")
    b.insert("ayman")
    print "ahmed" in b
    print "ayman" in b
    print "memo" in b

    del b
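For context on the false-positive trade-off mentioned in the __init__ docstring: with an m-bit table, k hash functions, and n inserted items, the usual estimate of the false-positive rate is p ≈ (1 - e^(-kn/m))^k. A rough sketch of that estimate (editor's illustration under the standard Bloom filter model; the exact behaviour of libcalg's implementation may differ):

import math

def bloom_false_positive_rate(table_size, num_functions, num_items):
    """Approximate false-positive probability of a Bloom filter."""
    m, k, n = float(table_size), num_functions, num_items
    return (1.0 - math.exp(-k * n / m)) ** k

# With the wrapper's defaults (table_size=128, num_functions=1), 50 inserts
# already give bloom_false_positive_rate(128, 1, 50) ~= 0.32, hence the much
# larger bloomfilterSize value the dupe filters below pull from xsscrapy/settings.py.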
6 changes: 3 additions & 3 deletions xsscrapy/bloomfilters.py
@@ -1,4 +1,4 @@
from pybloomfilter import BloomFilter
from xsscrapy.bloomfilter import BloomFilter
from scrapy.utils.job import job_dir
from scrapy.dupefilters import BaseDupeFilter
from xsscrapy.settings import bloomfilterSize
@@ -8,7 +8,7 @@ class BloomURLDupeFilter(BaseDupeFilter):

    def __init__(self, path=None):
        self.file = None
        self.fingerprints = BloomFilter(bloomfilterSize*10, 0.0001)
        self.fingerprints = BloomFilter(bloomfilterSize)

    @classmethod
    def from_settings(cls, settings):
@@ -18,7 +18,7 @@ def request_seen(self, request):
        fp = request.url
        if fp in self.fingerprints:
            return True
        self.fingerprints.add(fp)
        self.fingerprints.insert(fp)

    def close(self, reason):
        self.fingerprints = None
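Note the API difference behind this change: the previous pybloomfilter.BloomFilter appears to take a capacity and a target error rate (hence BloomFilter(bloomfilterSize*10, 0.0001)), while the new ctypes wrapper takes a bit-table size plus optional hash function and hash count, and exposes insert()/query()/__contains__ rather than add(). A short sketch of the dupe-filter pattern under those assumptions (hypothetical values, not part of the PR):

from xsscrapy.bloomfilter import BloomFilter

fingerprints = BloomFilter(table_size=3000000)

url = 'http://example.com/?q=1'
if url not in fingerprints:      # __contains__ calls query()
    fingerprints.insert(url)     # values are handed to the C library as str(val)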
44 changes: 23 additions & 21 deletions xsscrapy/middlewares.py
@@ -1,26 +1,24 @@
from scrapy.exceptions import IgnoreRequest
from urllib.parse import unquote
from pybloomfilter import BloomFilter
from urlparse import unquote
from xsscrapy.bloomfilter import BloomFilter
import random
import re
from xsscrapy.settings import bloomfilterSize

# Filter out duplicate requests with Bloom filters since they're much easier on memory
#URLS_FORMS_HEADERS = BloomFilter(3000000, 0.00001)
URLS_SEEN = BloomFilter(bloomfilterSize, .0001)
FORMS_SEEN = BloomFilter(bloomfilterSize, .0001)
HEADERS_SEEN = BloomFilter(bloomfilterSize, .0001)
USER_AGENT_LIST = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']

class RandomUserAgentMiddleware(object):
    ''' Use a random user-agent for each request '''

    USER_AGENT_LIST = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0'
    ]

    def process_request(self, request, spider):
        ua = random.choice(USER_AGENT_LIST)
        ua = random.choice(RandomUserAgentMiddleware.USER_AGENT_LIST)
        if 'payload' in request.meta:
            payload = request.meta['payload']
            if 'User-Agent' in request.headers:
@@ -33,6 +31,10 @@ def process_request(self, request, spider):
class InjectedDupeFilter(object):
    ''' Filter duplicate payloaded URLs, headers, and forms since all of those have dont_filter = True '''

    URLS_SEEN = BloomFilter(bloomfilterSize)
    FORMS_SEEN = BloomFilter(bloomfilterSize)
    HEADERS_SEEN = BloomFilter(bloomfilterSize)

    def process_request(self, request, spider):

        meta = request.meta
@@ -46,21 +48,21 @@ def process_request(self, request, spider):
            #replace the delim characters with nothing so we only test the URL
            #with the payload
            no_delim_url = url.replace(delim, '')
            if no_delim_url in URLS_SEEN:
            if no_delim_url in InjectedDupeFilter.URLS_SEEN:
                raise IgnoreRequest
            spider.log('Sending payloaded URL: %s' % url)
            URLS_SEEN.add(url)
            InjectedDupeFilter.URLS_SEEN.insert(no_delim_url)
            return

        # Injected form dupe handling
        elif meta['xss_place'] == 'form':
            u = meta['POST_to']
            p = meta['xss_param']
            u_p = (u, p)
            if u_p in FORMS_SEEN:
            if u_p in InjectedDupeFilter.FORMS_SEEN:
                raise IgnoreRequest
            spider.log('Sending payloaded form param %s to: %s' % (p, u))
            FORMS_SEEN.add(u_p)
            InjectedDupeFilter.FORMS_SEEN.insert(u_p)
            return

        # Injected header dupe handling
@@ -69,8 +71,8 @@ def process_request(self, request, spider):
            h = meta['xss_param']
            # URL, changed header, payload
            u_h = (u, h)
            if u_h in HEADERS_SEEN:
            if u_h in InjectedDupeFilter.HEADERS_SEEN:
                raise IgnoreRequest
            spider.log('Sending payloaded %s header' % h)
            HEADERS_SEEN.add(u_h)
            InjectedDupeFilter.HEADERS_SEEN.insert(u_h)
            return
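Two details are worth noting about these middleware changes. First, the three filters move from module-level globals to class attributes on InjectedDupeFilter, so they are still created only once (when the class body executes) and are shared by every instance. Second, the wrapper stringifies whatever it is given, so the (URL, parameter) tuples used for forms and headers are stored by their str() representation; that is fine for deduplication as long as the membership test and the insert use the same key, as the code above does. A small sketch of the tuple-key behaviour, assuming the wrapper from xsscrapy/bloomfilter.py (editor's illustration):

from xsscrapy.bloomfilter import BloomFilter

seen = BloomFilter(table_size=1024)
key = ('http://example.com/login', 'username')   # (URL, parameter), as in FORMS_SEEN

seen.insert(key)                  # stored as str(key)
print key in seen                 # True (Python 2 print, matching the rest of the PR)
print ('other', 'param') in seen  # False, barring a false positive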
6 changes: 3 additions & 3 deletions xsscrapy/pipelines.py
@@ -3,16 +3,16 @@
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exceptions import DropItem
from html.parser import HTMLParser
from xsscrapy.items import vuln#, inj_resp
import HTMLParser
import re
import lxml.etree
import lxml.html
from lxml.html import soupparser, fromstring
import itertools
#from IPython import embed
from socket import gaierror, gethostbyname
from urllib.parse import urlparse
from urlparse import urlparse
from logging import CRITICAL, ERROR, WARNING, INFO, DEBUG

class XSSCharFinder(object):
@@ -26,7 +26,7 @@ def get_filename(self, url):
        filename = up + '.txt'

        return filename

    def open_spider(self, spider):
        self.filename = self.get_filename(spider.url)

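A side note on the import changes here and in middlewares.py: the PR swaps the Python 3 module names (html.parser, urllib.parse) back to the Python 2 ones (HTMLParser, urlparse), which matches the Python 2 style print statements in the new bloomfilter module. Since six is already pinned in requirements.txt, the imports could instead stay version-agnostic; a possible alternative, not part of this PR:

from six.moves import html_parser                    # HTMLParser on Py2, html.parser on Py3
from six.moves.urllib.parse import urlparse, unquote

parser = html_parser.HTMLParser()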