diff --git a/maryam.py b/maryam.py
index 9711e28e4..ca1e1c3c6 100644
--- a/maryam.py
+++ b/maryam.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
from maryam import __main__
import sys
diff --git a/maryam/core/util/iris/meta_search_util.py b/maryam/core/util/iris/meta_search_util.py
index ab0793518..f9a3bf323 100644
--- a/maryam/core/util/iris/meta_search_util.py
+++ b/maryam/core/util/iris/meta_search_util.py
@@ -15,9 +15,11 @@
from math import trunc
class main:
+
def __init__(self):
self.framework = main.framework
+
def make_cite(self, url: 'URL string') -> 'cite':
urlib = self.framework.urlib(url)
path = urlib.path
@@ -40,6 +42,7 @@ def make_cite(self, url: 'URL string') -> 'cite':
cite = f"{host}{path}"
return cite
+
def remove_dups(self, res):
urls = []
new = []
@@ -50,6 +53,7 @@ def remove_dups(self, res):
new.append(i)
return new
+
def simple_merge(results) -> 'merging results based on quality of engines':
engines_len = len(results)
merged = []
@@ -65,14 +69,17 @@ def simple_merge(results) -> 'merging results based on quality of engines':
return merged
+
def compute_count_consensus(
e: dict(type=list, help='list of search engines sorted by quality'),
- l: dict(type=int, help='number of results')) -> 'a list of numbers':
+ l: dict(type=int, help='number of results'),
+ ) -> 'a list of numbers':
x = len(e)
o = {}
for i in e:
- o[i] = trunc(l/x)
- l -= l - (l%x)
+ o[i] = trunc(l / x)
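+			# note: l -= l - (l % x) is equivalent to l %= x; only the remainder is left to hand out below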
+ l -= l - (l % x)
if l != 0:
if l < x:
for i in range(l):
diff --git a/maryam/core/util/iris/retriever.py b/maryam/core/util/iris/retriever.py
index cf7d69a21..fcba65d6b 100644
--- a/maryam/core/util/iris/retriever.py
+++ b/maryam/core/util/iris/retriever.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
"""
OWASP Maryam!
This program is free software: you can redistribute it and/or modify
@@ -87,7 +86,7 @@ def __init__(
lowercase=True,
preprocessor=None,
tokenizer=None,
- stop_words="english",
+ stop_words='english',
token_pattern=r"(?u)\b\w\w+\b",
ngram_range=(1, 2),
max_df=1,
@@ -124,11 +123,13 @@ def __init__(
)
self.vectorizer = vectorizer
+
def fit(self, df, y=None):
self.metadata = df
self.tfidf_matrix = self.vectorizer.fit_transform(list(map(' '.join, df["pages"])))
return self
+
def predict(self, query: str) -> 'OrderedDict':
"""
Compute the top_n closest documents given a query
@@ -160,10 +161,6 @@ def predict(self, query: str) -> 'OrderedDict':
table = prettytable.PrettyTable(["rank", "index", "title"])
for i in range(len(closest_docs_indices)):
index = closest_docs_indices[i]
- # if self.paragraphs:
- # article_index = self.paragraphs[int(index)]["index"]
- # title = self.metadata.iloc[int(article_index)]["title"]
- # else:
title = self.metadata.iloc[int(index)]["title"]
table.add_row([rank, index, title])
rank += 1
diff --git a/maryam/core/util/iris/safe_searcher.py b/maryam/core/util/iris/safe_searcher.py
deleted file mode 100644
index a2d08622f..000000000
--- a/maryam/core/util/iris/safe_searcher.py
+++ /dev/null
@@ -1,82 +0,0 @@
-"""
-OWASP Maryam!
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-"""
-
-from inspect import signature
-
-class main:
- def __init__(self, engine_q=None):
- self.framework = main.framework
-
- if engine_q is None:
- self._engine_q = [
- self.framework.google,
- self.framework.duckduckgo,
- self.framework.bing,
- self.framework.startpage,
- self.framework.dogpile,
- self.framework.qwant,
- self.framework.yandex,
- self.framework.yahoo,
- self.framework.ask,
- ]
- else:
- self._engine_q = engine_q
-
- self._error_record = []
-
-
- @property
- def _get_new_errors(self):
- new_errors = self.framework._error_stack[len(self._error_record):]
- self._error_record = self.framework._error_stack
- return new_errors
-
- def search(self, q, engine=None, limit=1, count=15):
- if engine is None:
- engine = self._engine_q.pop(0)
- else:
- if engine in self._engine_q:
- self._engine_q.remove(engine)
-
- results = None
-
- while results is None:
- sig = signature(engine.__init__)
- if 'limit' in sig.parameters and 'count' in sig.parameters:
- instance = engine(q, limit=limit, count=count)
- elif 'limit' in sig.parameters:
- instance = engine(q, limit=limit)
- elif 'count' in sig.parameters:
- instance = engine(q, count=count)
- else:
- instance = engine(q)
-
- instance.run_crawl()
-
- if hasattr(instance,'results'):
- results = instance.results
- else:
- results = instance.links_with_title
-
- results = results if len(results)>0 else None
-
- if any('captcha' in x.lower() or 'missed' in x.lower() for x in self.framework._error_stack):
- self.framework._reset_error_stack()
- if len(self._engine_q) != 0:
- engine = self._engine_q.pop(0)
- else:
- self.framework.error('All Engines Exhausted')
- return
-
- return results
diff --git a/maryam/core/util/iris/tf_histogram.py b/maryam/core/util/iris/tf_histogram.py
index d80bab4e4..1a564aa5f 100644
--- a/maryam/core/util/iris/tf_histogram.py
+++ b/maryam/core/util/iris/tf_histogram.py
@@ -17,6 +17,7 @@
from collections import Counter
BASEDIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../'))
+
class main:
def __init__(self, docs: 'documents', form: 'documet form. e.g html', without_punc=True):
@@ -33,18 +34,22 @@ def __init__(self, docs: 'documents', form: 'documet form. e.g html', without_pu
if self.without_punc:
self._punc()
+
def remove_stopwords(self, rest):
stops = open(os.path.join(BASEDIR, '../../', 'data', 'stopwords.csv')).read().split(',')
self.words = [x for x in self.words if x not in stops and x not in rest]
+
def _punc(self):
self.words = re.findall(r"[\w\-_#]{2,}", self.docs)
+
def _counter(self, last):
""" last: number of terms to show in plot """
bow = Counter(self.words)
return bow.most_common(last)
+
def plot_histogram(self, title, last, should_show=False):
import pandas as pd
import matplotlib.pyplot as plt
diff --git a/maryam/core/util/iris/topic.py b/maryam/core/util/iris/topic.py
index 19cdf9161..477f18a1d 100644
--- a/maryam/core/util/iris/topic.py
+++ b/maryam/core/util/iris/topic.py
@@ -11,13 +11,14 @@
You should have received a copy of the GNU General Public License
along with this program. If not, see .
"""
-# core/util/iris/topic.py
# Hatma Suryotrisongko
+import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
+BASEDIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../'))
class main:
@@ -25,7 +26,8 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
from dask import dataframe as dd
import json
- from gensim.parsing.preprocessing import remove_stopwords
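+		# gensim was dropped as a dependency; load the bundled stopwords.csv list instead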
+ self.stops = open(os.path.join(BASEDIR, '../../', 'data', 'stopwords.csv')).read().split(',')
if verbose == True:
print("\n\n DATASET = reading file : " + inputfile)
@@ -42,7 +43,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
print("\n\n csv file (before preprocessing) = ")
print(tmp4)
- self.corpus = tmp4[0].str.lower().apply(remove_stopwords).to_numpy()
+ self.corpus = tmp4[0].str.lower().apply(self.remove_stopwords).to_numpy()
elif filetype == "json":
with open(inputfile) as json_file:
@@ -55,7 +56,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
print(tmp)
tmp['td'] = tmp['t'] + ' ' + tmp['d']
- self.corpus = tmp['td'].str.lower().apply(remove_stopwords).to_numpy()
+ self.corpus = tmp['td'].str.lower().apply(self.remove_stopwords).to_numpy()
else:
print('ERROR, only accept csv or json file!')
@@ -73,6 +74,11 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose):
pd.Series([len(e.split()) for e in self.corpus]).hist()
plt.show()
+
+	def remove_stopwords(self, text):
+		# filter word-by-word; iterating over the string itself would drop single characters
+		return ' '.join([w for w in text.split() if w not in self.stops])
+
def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):
from sklearn.cluster import KMeans
diff --git a/maryam/core/util/iris/word_cloud.py b/maryam/core/util/iris/word_cloud.py
deleted file mode 100644
index 6f443af7d..000000000
--- a/maryam/core/util/iris/word_cloud.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
-OWASP Maryam!
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program. If not, see .
-"""
-
-import re
-import os
-from wordcloud import WordCloud
-import matplotlib.pyplot as plt
-
-class main:
- def __init__(self):
- self.framework = main.framework
-
- def _remove_url(self, data):
- return re.sub(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", '', data)
-
- def plot_wcloud(self, title, docs: 'documents', form: 'documet form. e.g html', limit: 'number of words',
- without_punc=True, remove_stopwords=False, should_show=True):
- docs = str(docs)
- docs = self._remove_url(docs)
- tf = self.framework.tf_histogram(docs, form, without_punc)
-
- if remove_stopwords:
- tf.remove_stopwords()
- bow = tf._counter(limit)
- cloud_data = ' '.join(i[0] for i in bow)
-
- if not cloud_data:
- self.framework.error('NoDataToPrintError.', 'util/iris/word_cloud', 'plot_wcloud')
- return False
-
- wcd = WordCloud().generate(cloud_data)
- plt.imshow(wcd, interpolation='bilinear')
- plt.axis("off")
- plt.title(title)
- filename = os.path.join(self.framework.workspace,title.replace(' ','_')+'.png')
- plt.savefig(filename, format="png")
- if should_show:
- plt.show()
-
- return filename
-
diff --git a/maryam/modules/iris/iris.py b/maryam/modules/iris/iris.py
index 6e9b2c369..c2e7298e6 100755
--- a/maryam/modules/iris/iris.py
+++ b/maryam/modules/iris/iris.py
@@ -21,7 +21,7 @@
'description': 'Iris is a built-in meta search engine.',
'comments': ('It should be note that this is a beta version and has many bugs!',),
'contributors': 'Aman, Dimitris, Divya, Vikas, Kunal',
- 'sources': ('google', 'bing', 'duckduckgo', 'etools', 'startpage', 'searx', 'yahoo'),
+ 'sources': ('google', 'bing', 'duckduckgo', 'etools', 'startpage', 'yahoo'),
'options': (
('query', None, True, 'Query string', '-q', 'store', str),
),
diff --git a/maryam/modules/iris/iris_cluster.py b/maryam/modules/iris/iris_cluster.py
index 054e096c7..05443e4b6 100755
--- a/maryam/modules/iris/iris_cluster.py
+++ b/maryam/modules/iris/iris_cluster.py
@@ -13,11 +13,11 @@
"""
meta = {
- 'name': 'Iris_Cluster',
+ 'name': 'Iris Cluster',
'author': 'Shaad',
'version': '0.1',
'description': 'Get Iris Search result and clustered results for your query',
- 'required': ('kneed', 'mlxtend, numpy, sklearn'),
+ 'required': ('kneed', 'mlxtend', 'numpy', 'sklearn'),
'options': (
('query', None, True, 'Query string', '-q', 'store', str),
),
@@ -51,7 +51,7 @@ def module_run(self):
print('\n\nCLUSTER RESULT: ')
for index, title in enumerate(output):
- print('\n')
- print(f"CLUSTER {index+1}")
- print(f"TITLE: {title}")
- print(' '+'\n '.join(output[title]))
+ print('\n')
+ print(f"CLUSTER {index+1}")
+ print(f"TITLE: {title}")
+ print(' '+'\n '.join(output[title]))
diff --git a/maryam/modules/iris/sentiment.py b/maryam/modules/iris/sentiment.py
index 253216270..dc2367da5 100755
--- a/maryam/modules/iris/sentiment.py
+++ b/maryam/modules/iris/sentiment.py
@@ -70,7 +70,7 @@ def module_api(self):
return
DATA = loads(file.read())
if key not in DATA and key != None:
- self.error("The key doesn't exists", 'module_api', 'iris/sentiment')
+		self.error("The key doesn't exist", 'module_api', 'iris/sentiment')
return
if key != None:
DATA = DATA[key]
diff --git a/maryam/modules/iris/topicmodeling.py b/maryam/modules/iris/topicmodeling.py
index 61d6f5d2e..3818e0225 100755
--- a/maryam/modules/iris/topicmodeling.py
+++ b/maryam/modules/iris/topicmodeling.py
@@ -60,4 +60,4 @@ def module_api(self):
def module_run(self):
output = module_api(self)
self.output("\n\nOutput = \n")
- self.output( output )
\ No newline at end of file
+ self.output( output )
diff --git a/requirements b/requirements
index 5b06f8d64..b12bf0c36 100644
--- a/requirements
+++ b/requirements
@@ -1,55 +1,21 @@
beautifulsoup4
bertopic
-bs4
certifi
-charset-normalizer
-click
cloudscraper
-Cython
dask
Flask
-gensim
huggingface-hub
-Jinja2
-joblib
-locket
lxml
matplotlib
-mpmath
numpy
packaging
pandas
-partd
-Pillow
-plotly
-pynndescent
-pyparsing
-python-dateutil
-pytz
-PyYAML
-regex
requests
-requests-toolbelt
scikit-learn
-scipy
sentence-transformers
-sentencepiece
-six
-smart-open
-soupsieve
-sympy
-tenacity
-threadpoolctl
-tokenizers
-toolz
top2vec
-tqdm
transformers
typing-extensions
umap
umap-learn
-urllib3
vaderSentiment
-Werkzeug
-wordcloud
-zipp
diff --git a/setup.py b/setup.py
index 54ce643f7..59f92e715 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
"""
OWASP Maryam!
@@ -34,25 +33,28 @@
keywords=['OWASP', 'OSINT', 'search-engine', 'social-networks', 'Maryam'],
scripts=['bin/maryam'],
install_requires=[
- 'requests',
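+		# kept in sync with the top-level requirements file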
+ 'beautifulsoup4',
+ 'bertopic',
+ 'certifi',
'cloudscraper',
- 'bs4',
+ 'dask',
+ 'Flask',
+ 'huggingface-hub',
'lxml',
- 'flask',
- 'vaderSentiment',
- 'plotly',
'matplotlib',
- 'pandas',
- 'wordcloud',
'numpy',
- 'dask',
+ 'packaging',
+ 'pandas',
+ 'requests',
'scikit-learn',
- 'scipy',
+ 'sentence-transformers',
+ 'top2vec',
+ 'transformers',
+ 'typing-extensions',
'umap',
- 'bertopic',
- 'sentence_transformers',
- 'gensim',
- 'top2vec'
+ 'umap-learn',
+ 'vaderSentiment',
],
classifiers=[
'Programming Language :: Python :: 3.10',