diff --git a/maryam.py b/maryam.py
index 9711e28e4..ca1e1c3c6 100644
--- a/maryam.py
+++ b/maryam.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 from maryam import __main__
 
 import sys
diff --git a/maryam/core/util/iris/meta_search_util.py b/maryam/core/util/iris/meta_search_util.py
index ab0793518..f9a3bf323 100644
--- a/maryam/core/util/iris/meta_search_util.py
+++ b/maryam/core/util/iris/meta_search_util.py
@@ -15,9 +15,11 @@ from math import trunc
 
 
 class main:
+
 	def __init__(self):
 		self.framework = main.framework
 
+
 	def make_cite(self, url: 'URL string') -> 'cite':
 		urlib = self.framework.urlib(url)
 		path = urlib.path
@@ -40,6 +42,7 @@ def make_cite(self, url: 'URL string') -> 'cite':
 		cite = f"{host}{path}"
 		return cite
 
+
 	def remove_dups(self, res):
 		urls = []
 		new = []
@@ -50,6 +53,7 @@ def remove_dups(self, res):
 				new.append(i)
 		return new
 
+
 	def simple_merge(results) -> 'merging results based on quality of engines':
 		engines_len = len(results)
 		merged = []
@@ -65,14 +69,17 @@ def simple_merge(results) -> 'merging results based on quality of engines':
 
 		return merged
 
+
 	def compute_count_consensus(
 		e: dict(type=list, help='list of search engines sorted by quality'),
-		l: dict(type=int, help='number of results')) -> 'a list of numbers':
+		l: dict(type=int, help='number of results'),
+		) -> 'a list of numbers':
 		x = len(e)
 		o = {}
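+		# hand each engine trunc(l / x) results; the l % x remainder is shared out below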
 		for i in e:
-			o[i] = trunc(l/x)
-		l -= l - (l%x)
+			o[i] = trunc(l / x)
+		l -= l - (l % x)
 		if l != 0:
 			if l < x:
 				for i in range(l):
diff --git a/maryam/core/util/iris/retriever.py b/maryam/core/util/iris/retriever.py
index cf7d69a21..fcba65d6b 100644
--- a/maryam/core/util/iris/retriever.py
+++ b/maryam/core/util/iris/retriever.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
 OWASP Maryam!
 This program is free software: you can redistribute it and/or modify
@@ -87,7 +86,7 @@ def __init__(
 		lowercase=True,
 		preprocessor=None,
 		tokenizer=None,
-		stop_words="english",
+		stop_words='english',
 		token_pattern=r"(?u)\b\w\w+\b",
 		ngram_range=(1, 2),
 		max_df=1,
@@ -124,11 +123,13 @@ def __init__(
 		)
 		self.vectorizer = vectorizer
 
+
 	def fit(self, df, y=None):
 		self.metadata = df
 		self.tfidf_matrix = self.vectorizer.fit_transform(list(map(' '.join, df["pages"])))
 		return self
 
+
 	def predict(self, query: str) -> 'OrderedDict':
 		"""
 		Compute the top_n closest documents given a query
@@ -160,10 +161,9 @@ def predict(self, query: str) -> 'OrderedDict':
 		table = prettytable.PrettyTable(["rank", "index", "title"])
 		for i in range(len(closest_docs_indices)):
 			index = closest_docs_indices[i]
-			# if self.paragraphs:
-			# 	article_index = self.paragraphs[int(index)]["index"]
-			# 	title = self.metadata.iloc[int(article_index)]["title"]
-			# else:
+
+
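+			# each ranked index maps directly to a row in the metadata frame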
""" -# core/util/iris/topic.py # Hatma Suryotrisongko +import os import pandas as pd import numpy as np import matplotlib.pyplot as plt from sentence_transformers import SentenceTransformer +BASEDIR = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../')) class main: @@ -25,7 +26,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose): from dask import dataframe as dd import json - from gensim.parsing.preprocessing import remove_stopwords + self.stops = open(os.path.join(BASEDIR, '../../', 'data', 'stopwords.csv')).read().split(',') if verbose == True: print("\n\n DATASET = reading file : " + inputfile) @@ -42,7 +43,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose): print("\n\n csv file (before preprocessing) = ") print(tmp4) - self.corpus = tmp4[0].str.lower().apply(remove_stopwords).to_numpy() + self.corpus = tmp4[0].str.lower().apply(self.remove_stopwords).to_numpy() elif filetype == "json": with open(inputfile) as json_file: @@ -55,7 +56,7 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose): print(tmp) tmp['td'] = tmp['t'] + ' ' + tmp['d'] - self.corpus = tmp['td'].str.lower().apply(remove_stopwords).to_numpy() + self.corpus = tmp['td'].str.lower().apply(self.remove_stopwords).to_numpy() else: print('ERROR, only accept csv or json file!') @@ -73,6 +74,11 @@ def __init__(self, inputfile, filetype, keyword, showcharts, verbose): pd.Series([len(e.split()) for e in self.corpus]).hist() plt.show() + + def remove_stopwords(self, text): + return ''.join([x for x in text if x not in self.stops]) + + def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose): from sklearn.cluster import KMeans diff --git a/maryam/core/util/iris/word_cloud.py b/maryam/core/util/iris/word_cloud.py deleted file mode 100644 index 6f443af7d..000000000 --- a/maryam/core/util/iris/word_cloud.py +++ /dev/null @@ -1,52 +0,0 @@ -""" -OWASP Maryam! -This program is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -any later version. -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. -You should have received a copy of the GNU General Public License -along with this program. If not, see . -""" - -import re -import os -from wordcloud import WordCloud -import matplotlib.pyplot as plt - -class main: - def __init__(self): - self.framework = main.framework - - def _remove_url(self, data): - return re.sub(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", '', data) - - def plot_wcloud(self, title, docs: 'documents', form: 'documet form. 
+		return ' '.join([word for word in text.split() if word not in self.stops])
+
+
 	def run_sklearn_cluster_kmeans(self, selected_pretrained_model, showcharts, verbose):
 		from sklearn.cluster import KMeans
diff --git a/maryam/core/util/iris/word_cloud.py b/maryam/core/util/iris/word_cloud.py
deleted file mode 100644
index 6f443af7d..000000000
--- a/maryam/core/util/iris/word_cloud.py
+++ /dev/null
@@ -1,52 +0,0 @@
-"""
-OWASP Maryam!
-This program is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation, either version 3 of the License, or
-any later version.
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program. If not, see <http://www.gnu.org/licenses/>.
-"""
-
-import re
-import os
-from wordcloud import WordCloud
-import matplotlib.pyplot as plt
-
-class main:
-	def __init__(self):
-		self.framework = main.framework
-
-	def _remove_url(self, data):
-		return re.sub(r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", '', data)
-
-	def plot_wcloud(self, title, docs: 'documents', form: 'documet form. e.g html', limit: 'number of words',
-				without_punc=True, remove_stopwords=False, should_show=True):
-		docs = str(docs)
-		docs = self._remove_url(docs)
-		tf = self.framework.tf_histogram(docs, form, without_punc)
-
-		if remove_stopwords:
-			tf.remove_stopwords()
-		bow = tf._counter(limit)
-		cloud_data = ' '.join(i[0] for i in bow)
-
-		if not cloud_data:
-			self.framework.error('NoDataToPrintError.', 'util/iris/word_cloud', 'plot_wcloud')
-			return False
-
-		wcd = WordCloud().generate(cloud_data)
-		plt.imshow(wcd, interpolation='bilinear')
-		plt.axis("off")
-		plt.title(title)
-		filename = os.path.join(self.framework.workspace,title.replace(' ','_')+'.png')
-		plt.savefig(filename, format="png")
-		if should_show:
-			plt.show()
-
-		return filename
-
diff --git a/maryam/modules/iris/iris.py b/maryam/modules/iris/iris.py
index 6e9b2c369..c2e7298e6 100755
--- a/maryam/modules/iris/iris.py
+++ b/maryam/modules/iris/iris.py
@@ -21,7 +21,7 @@
 	'description': 'Iris is a built-in meta search engine.',
 	'comments': ('It should be note that this is a beta version and has many bugs!',),
 	'contributors': 'Aman, Dimitris, Divya, Vikas, Kunal',
-	'sources': ('google', 'bing', 'duckduckgo', 'etools', 'startpage', 'searx', 'yahoo'),
+	'sources': ('google', 'bing', 'duckduckgo', 'etools', 'startpage', 'yahoo'),
 	'options': (
 		('query', None, True, 'Query string', '-q', 'store', str),
 	),
diff --git a/maryam/modules/iris/iris_cluster.py b/maryam/modules/iris/iris_cluster.py
index 054e096c7..05443e4b6 100755
--- a/maryam/modules/iris/iris_cluster.py
+++ b/maryam/modules/iris/iris_cluster.py
@@ -13,11 +13,11 @@
 """
 
 meta = {
-	'name': 'Iris_Cluster',
+	'name': 'Iris Cluster',
 	'author': 'Shaad',
 	'version': '0.1',
 	'description': 'Get Iris Search result and clustered results for your query',
-	'required': ('kneed', 'mlxtend, numpy, sklearn'),
+	'required': ('kneed', 'mlxtend', 'numpy', 'sklearn'),
 	'options': (
 		('query', None, True, 'Query string', '-q', 'store', str),
 	),
@@ -51,7 +51,8 @@ def module_run(self):
 
 	print('\n\nCLUSTER RESULT: ')
 	for index, title in enumerate(output):
-        print('\n')
-        print(f"CLUSTER {index+1}")
-        print(f"TITLE: {title}")
-        print(' '+'\n '.join(output[title]))
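+		# one block per cluster: a blank line, the numbered header, the cluster title, then its members indented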
+		print('\n')
+		print(f"CLUSTER {index+1}")
+		print(f"TITLE: {title}")
+		print(' '+'\n '.join(output[title]))
diff --git a/maryam/modules/iris/sentiment.py b/maryam/modules/iris/sentiment.py
index 253216270..dc2367da5 100755
--- a/maryam/modules/iris/sentiment.py
+++ b/maryam/modules/iris/sentiment.py
@@ -70,7 +70,7 @@ def module_api(self):
 			return
 		DATA = loads(file.read())
 	if key not in DATA and key != None:
-		self.error("The key doesn't exists", 'module_api', 'iris/sentiment')
+		self.error('The key doesn\'t exist', 'module_api', 'iris/sentiment')
 		return
 	if key != None:
 		DATA = DATA[key]
diff --git a/maryam/modules/iris/topicmodeling.py b/maryam/modules/iris/topicmodeling.py
index 61d6f5d2e..3818e0225 100755
--- a/maryam/modules/iris/topicmodeling.py
+++ b/maryam/modules/iris/topicmodeling.py
@@ -60,4 +60,4 @@ def module_api(self):
 def module_run(self):
 	output = module_api(self)
 	self.output("\n\nOutput = \n")
-	self.output( output )
\ No newline at end of file
+	self.output( output )
diff --git a/requirements b/requirements
index 5b06f8d64..b12bf0c36 100644
--- a/requirements
+++ b/requirements
@@ -1,55 +1,21 @@
 beautifulsoup4
 bertopic
-bs4
 certifi
-charset-normalizer
-click
 cloudscraper
-Cython
 dask
 Flask
-gensim
 huggingface-hub
-Jinja2
-joblib
-locket
 lxml
 matplotlib
-mpmath
 numpy
 packaging
 pandas
-partd
-Pillow
-plotly
-pynndescent
-pyparsing
-python-dateutil
-pytz
-PyYAML
-regex
 requests
-requests-toolbelt
 scikit-learn
-scipy
 sentence-transformers
-sentencepiece
-six
-smart-open
-soupsieve
-sympy
-tenacity
-threadpoolctl
-tokenizers
-toolz
 top2vec
-tqdm
 transformers
 typing-extensions
 umap
 umap-learn
-urllib3
 vaderSentiment
-Werkzeug
-wordcloud
-zipp
diff --git a/setup.py b/setup.py
index 54ce643f7..59f92e715 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
 
 OWASP Maryam!
@@ -34,25 +33,27 @@
 	keywords=['OWASP', 'OSINT', 'search-engine', 'social-networks', 'Maryam'],
 	scripts=['bin/maryam'],
 	install_requires=[
-		'requests',
+		'beautifulsoup4',
+		'bertopic',
+		'certifi',
 		'cloudscraper',
-		'bs4',
+		'dask',
+		'Flask',
+		'huggingface-hub',
 		'lxml',
-		'flask',
-		'vaderSentiment',
-		'plotly',
 		'matplotlib',
-		'pandas',
-		'wordcloud',
 		'numpy',
-		'dask',
+		'packaging',
+		'pandas',
+		'requests',
 		'scikit-learn',
-		'scipy',
+		'sentence-transformers',
+		'top2vec',
+		'transformers',
+		'typing-extensions',
 		'umap',
-		'bertopic',
-		'sentence_transformers',
-		'gensim',
-		'top2vec'
+		'umap-learn',
+		'vaderSentiment',
 	],
 	classifiers=[
 		'Programming Language :: Python :: 3.10',