diff --git a/prashant/README.md.txt b/prashant/README.md.txt
new file mode 100644
index 00000000..7270eb48
--- /dev/null
+++ b/prashant/README.md.txt
@@ -0,0 +1,3 @@
+I was learning through this project. It was tough for me to implement at this stage.
+
+I can contribute the idea of adding a "User-LogIn" feature, so that search result recommendations can vary from person to person.
\ No newline at end of file
diff --git a/prashant/Task4/README.md.txt b/prashant/Task4/README.md.txt
new file mode 100644
index 00000000..7270eb48
--- /dev/null
+++ b/prashant/Task4/README.md.txt
@@ -0,0 +1,3 @@
+I was learning through this project. It was tough for me to implement at this stage.
+
+I can contribute the idea of adding a "User-LogIn" feature, so that search result recommendations can vary from person to person.
\ No newline at end of file
diff --git a/prashant/Task4/__pycache__/app.cpython-39.pyc b/prashant/Task4/__pycache__/app.cpython-39.pyc
new file mode 100644
index 00000000..069812d8
Binary files /dev/null and b/prashant/Task4/__pycache__/app.cpython-39.pyc differ
diff --git a/prashant/Task4/__pycache__/query_processing.cpython-39.pyc b/prashant/Task4/__pycache__/query_processing.cpython-39.pyc
new file mode 100644
index 00000000..3b415edb
Binary files /dev/null and b/prashant/Task4/__pycache__/query_processing.cpython-39.pyc differ
diff --git a/prashant/Task4/__pycache__/ranking.cpython-39.pyc b/prashant/Task4/__pycache__/ranking.cpython-39.pyc
new file mode 100644
index 00000000..04c57a10
Binary files /dev/null and b/prashant/Task4/__pycache__/ranking.cpython-39.pyc differ
diff --git a/prashant/Task4/app.py b/prashant/Task4/app.py
new file mode 100644
index 00000000..bcdf75d7
--- /dev/null
+++ b/prashant/Task4/app.py
@@ -0,0 +1,79 @@
+from flask import Flask, render_template, request
+import pymongo
+from flask_paginate import Pagination, get_page_args
+from ranking import Ranking
+from query_processing import QueryProcessing
+import time
+
+
+app = Flask(__name__)
+
+
+@app.route('/')
+def entry_point():
+    return render_template('home.html')
+
+
+@app.route('/search_results')
+def search_results():
+    connect_url = 'mongodb://127.0.0.1:27017/'
+
+    client = pymongo.MongoClient(connect_url, connect=False)
+
+    db = client.results
+
+    search_string = request.args.get('search')
+
+    # Normalise the raw query into cleaned, stemmed keywords.
+    processor = QueryProcessing(search_string)
+    keywords = processor.processor()
+
+    query = []
+
+    start = time.time()
+
+    for keyword in keywords:
+        query.extend(db.search_results.find(
+            {'$text': {'$search': keyword, '$caseSensitive': False}}))
+
+    end = time.time()
+    print(f"time to execute: {end - start}")
+
+    # Drop duplicate documents returned by more than one keyword query.
+    search_result = []
+
+    for doc in query:
+        exist = False
+        for result in search_result:
+            if result['title'] == doc['title'] or result['url'] == doc['url']:
+                exist = True
+                break
+
+        if not exist:
+            search_result.append(doc)
+
+    rank = Ranking(search_result, search_string)
+
+    ranked_result = rank.sorted_results()
+
+    client.close()
+
+    page, per_page, offset = get_page_args(page_parameter='page',
+                                           per_page_parameter='per_page')
+
+    total = len(ranked_result)
+
+    pagination = Pagination(page=page, per_page=per_page, total=total,
+                            css_framework='bootstrap4')
+
+    return render_template('search.html',
+                           search_result=ranked_result[offset:offset + per_page],
+                           page=page,
+                           per_page=per_page,
+                           pagination=pagination,
+                           search_string=search_string
+                           )
+
+
+if __name__ == '__main__':
+    app.run(debug=True, port=8000)
\ No newline at end of file
diff --git a/prashant/Task4/crawler/__pycache__/popular_links.cpython-39.pyc b/prashant/Task4/crawler/__pycache__/popular_links.cpython-39.pyc
new file mode 100644
index 00000000..ee701107
Binary files /dev/null and b/prashant/Task4/crawler/__pycache__/popular_links.cpython-39.pyc differ
diff --git a/prashant/Task4/crawler/crawler.py b/prashant/Task4/crawler/crawler.py
new file mode 100644
index 00000000..5bde1046
--- /dev/null
+++ b/prashant/Task4/crawler/crawler.py
@@ -0,0 +1,118 @@
+from bs4 import BeautifulSoup
+import requests
+import pymongo
+import urllib.parse
+from popular_links import Popularity
+import sys
+
+
+class Crawler():
+    connect_url = 'mongodb://127.0.0.1:27017/'
+
+    client = pymongo.MongoClient(connect_url)
+
+    db = client.results
+
+    search_results = []
+
+    url_count = 1
+
+    def start_crawl(self, url, depth):
+        robot_url = urllib.parse.urljoin(url, '/robots.txt')
+        try:
+            robots = requests.get(robot_url)
+        except requests.RequestException:
+            # No reachable robots.txt: crawl without a disallow list.
+            print("robots not found")
+            self.crawl(url, depth, [])
+            return
+
+        soup = BeautifulSoup(robots.text, 'lxml')
+
+        sauce = soup.find('p').text
+
+        content = sauce.split()
+
+        disallowed_links = []
+
+        for word in content:
+            if word[0] == '/':
+                disallowed_links.append(urllib.parse.urljoin(url, word))
+            elif 'http' in word:
+                disallowed_links.append(word)
+        print("got robots!!!")
+
+        self.crawl(url, depth, disallowed_links)
+
+    def crawl(self, url, depth, *disallowed_links):
+
+        try:
+            print(f'Crawling url {self.url_count}: {url} at depth: {depth}')
+            self.url_count += 1
+            response = requests.get(url)
+
+        except requests.RequestException:
+            print(f'Failed to perform HTTP GET request on {url}')
+            return
+
+        soup = BeautifulSoup(response.text, 'lxml')
+
+        try:
+            title = soup.find('title').text
+            description = ''
+
+            for tag in soup.findAll():
+                if tag.name == 'p':
+                    description += tag.text.strip().replace('\n', '')
+
+        except AttributeError:
+            print("Failed to retrieve title and description\n")
+            return
+
+        popularity = Popularity(url)
+        popularity_score = popularity.popularity_score()
+
+        query = {
+            'url': url,
+            'title': title,
+            'description': description,
+            'score': 0,
+            'popularity': popularity_score,
+        }
+
+        search_results = self.db.search_results
+
+        search_results.insert_one(query)
+
+        search_results.create_index([
+            ('url', pymongo.TEXT),
+            ('title', pymongo.TEXT),
+            ('description', pymongo.TEXT),
+            ('score', 1),
+            ('popularity', 1)
+        ], name='search_results', default_language='english')
+
+        if depth == 0:
+            return
+
+        links = soup.findAll('a')
+
+        for link in links:
+            try:
+                if link['href'] not in disallowed_links[0]:
+                    if 'http' in link['href']:
+                        self.crawl(link['href'], depth - 1, disallowed_links[0])
+                    else:
+                        link['href'] = urllib.parse.urljoin(url, link['href'])
+                        self.crawl(link['href'], depth - 1, disallowed_links[0])
+            except KeyError:
+                # Anchor tag without an href attribute: skip it.
+                pass
+
+
+crawler = Crawler()
+
+crawler.start_crawl(
+    sys.argv[1], int(sys.argv[2]))
+
+crawler.client.close()
\ No newline at end of file
diff --git a/prashant/Task4/crawler/popular_links.py b/prashant/Task4/crawler/popular_links.py
new file mode 100644
index 00000000..2a27aecb
--- /dev/null
+++ b/prashant/Task4/crawler/popular_links.py
@@ -0,0 +1,20 @@
+class Popularity():
+    popular_domains = ['https://en.wikipedia.org/', 'https://www.python.org/', 'https://www.rottentomatoes.com/',
+                       'https://pypi.org/', 'https://www.indiatoday.in/', 'https://www.geeksforgeeks.org/',
+                       'https://stackoverflow.com/']
+
+    ps = 0
+
+    def __init__(self, url):
+        self.url = url
+
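+    # Each matching popular domain adds an equal share of a 100-point budget;
+    # an exact homepage match passes both checks below, so it scores twice.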
+    def popularity_score(self):
+        for domain in self.popular_domains:
+            if domain == self.url:
+                self.ps += 100/len(self.popular_domains)
+            if domain in self.url:
+                self.ps += 100/len(self.popular_domains)
+
+        return self.ps
\ No newline at end of file
diff --git a/prashant/Task4/query_processing.py b/prashant/Task4/query_processing.py
new file mode 100644
index 00000000..2ebe5667
--- /dev/null
+++ b/prashant/Task4/query_processing.py
@@ -0,0 +1,34 @@
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem.porter import PorterStemmer
+from spellchecker import SpellChecker
+import string
+
+
+class QueryProcessing():
+    def __init__(self, search_string):
+        self.search_string = search_string
+
+    def processor(self):
+        self.search_string = self.search_string.lower()
+
+        # Strip punctuation and collapse repeated whitespace.
+        translator = str.maketrans('', '', string.punctuation)
+        self.search_string = self.search_string.translate(translator)
+
+        self.search_string = " ".join(self.search_string.split())
+
+        # Tokenize and drop English stop words.
+        stop_words = set(stopwords.words("english"))
+        word_tokens = word_tokenize(self.search_string)
+        tokens = [word for word in word_tokens if word not in stop_words]
+
+        stemmer = PorterStemmer()
+        tokens = [stemmer.stem(word) for word in tokens]
+
+        # correction() may return None for unknown words; keep the original token then.
+        spell = SpellChecker()
+        for i in range(len(tokens)):
+            tokens[i] = spell.correction(tokens[i]) or tokens[i]
+
+        return tokens
\ No newline at end of file
diff --git a/prashant/Task4/ranking.py b/prashant/Task4/ranking.py
new file mode 100644
index 00000000..4afe9bb1
--- /dev/null
+++ b/prashant/Task4/ranking.py
@@ -0,0 +1,57 @@
+from operator import itemgetter
+
+
+class Ranking:
+    def __init__(self, results, query):
+        self.results = results
+        self.query = query
+
+    def search(self):
+        res = []
+        filtered = []
+        if '"' in self.query:
+            # Exact-phrase query: strip the quotes and keep the phrase intact.
+            res.insert(0, self.query.replace('"', '').strip())
+            return res
+        if ':' in self.query:
+            # 'term:site' keeps only results whose URL contains the filter.
+            key = self.query.split(':')[0]
+            fil = self.query.split(':')[1]
+            print(key)
+            print(fil)
+            for result in self.results:
+                if fil.lower() in result['url'].lower():
+                    filtered.append(result)
+            self.results = filtered
+        elif '-' in self.query:
+            # 'term-word' excludes results that mention the word anywhere.
+            key = self.query.split('-')[0]
+            fil = self.query.split('-')[1]
+            for result in self.results:
+                if fil.lower() not in result['title'].lower() and fil.lower() not in result['description'].lower():
+                    filtered.append(result)
+            self.results = filtered
+        else:
+            key = self.query
+
+        res = key.split()
+        return res
+
+    def ranked_results(self):
+        keywords = self.search()
+        for key in keywords:
+            for result in self.results:
+                if key.lower() in result['title'].lower():
+                    result['score'] += 2
+                if key.lower() in result['description'].lower():
+                    result['score'] += 1
+
+        return self.results
+
+    def sorted_results(self):
+        ranked_searches = self.ranked_results()
+
+        sorted_searches = sorted(
+            ranked_searches, key=itemgetter('popularity', 'score'), reverse=True)
+
+        return sorted_searches
diff --git a/prashant/Task4/static/images/ask1.png b/prashant/Task4/static/images/ask1.png
new file mode 100644
index 00000000..22b63a06
Binary files /dev/null and b/prashant/Task4/static/images/ask1.png differ
diff --git a/prashant/Task4/static/images/searchlogo.png b/prashant/Task4/static/images/searchlogo.png
new file mode 100644
index 00000000..e0dc72f6
Binary files /dev/null and b/prashant/Task4/static/images/searchlogo.png differ
diff --git a/prashant/Task4/templates/base.html b/prashant/Task4/templates/base.html
new file mode 100644
index 00000000..3f534441
--- /dev/null
+++ b/prashant/Task4/templates/base.html
@@ -0,0 +1,23 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+
+    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css">
+
+    <title>GLUGLE</title>
+</head>
+
+<body>
+
+    {% block content %}{% endblock %}
+
+    <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js"></script>
+    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js"></script>
+    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js"></script>
+
+</body>
+
+</html>
\ No newline at end of file
diff --git a/prashant/Task4/templates/home.html b/prashant/Task4/templates/home.html
new file mode 100644
index 00000000..98786e46
--- /dev/null
+++ b/prashant/Task4/templates/home.html
@@ -0,0 +1,18 @@
+{% extends 'base.html' %}
+
+{% block content %}
+
+<div class="container text-center">
+    <div class="row justify-content-center">
+        <img src="{{ url_for('static', filename='images/searchlogo.png') }}" alt="Glugle">
+    </div>
+
+    <form action="/search_results" method="GET">
+        <div class="form-group">
+            <input type="text" name="search" class="form-control" placeholder="Search">
+        </div>
+        <button type="submit" class="btn btn-primary">Glugle Search</button>
+    </form>
+</div>
+
+{% endblock %}
\ No newline at end of file
diff --git a/prashant/Task4/templates/search.html b/prashant/Task4/templates/search.html
new file mode 100644
index 00000000..bbe81f57
--- /dev/null
+++ b/prashant/Task4/templates/search.html
@@ -0,0 +1,34 @@
+{% extends 'base.html' %}
+
+{% block content %}
+
+<div class="container">
+    <form action="/search_results" method="GET" class="form-inline my-4">
+        <input type="text" name="search" class="form-control mr-2" value="{{search_string}}">
+        <button type="submit" class="btn btn-primary">Search</button>
+    </form>
+
+    <h5>Search result for '{{search_string}}'</h5>
+
+    {% if search_result %}
+    {% for link in search_result %}
+    <div class="card my-3">
+        <div class="card-body">
+            <a href="{{ link.url }}">
+                <h5>{{ link.title }}</h5>
+            </a>
+            {{ link.url }}
+            <p>{{ link.description[:300] }}...</p>
+        </div>
+    </div>
+    {% endfor %}
+    {% else %}
+    No results found
+    {% endif %}
+
+    <div class="mt-4">
+        {{pagination.links}}
+    </div>
+</div>
+
+{% endblock %}
\ No newline at end of file
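
A minimal sketch of the "User-LogIn" idea proposed in the READMEs, assuming a
hypothetical `users` collection in the same MongoDB instance that stores per-user
click counts by domain; the collection name, its schema, the weight of 3, and the
`personalized_sort` helper are illustrative assumptions, not part of this patch:

    import pymongo
    import urllib.parse


    def personalized_sort(ranked_results, username):
        """Re-rank search results by boosting domains this user clicked before."""
        client = pymongo.MongoClient('mongodb://127.0.0.1:27017/')
        # Hypothetical schema: {'user': 'alice', 'clicks': {'en.wikipedia.org': 7}}
        profile = client.results.users.find_one({'user': username}) or {'clicks': {}}
        clicks = profile['clicks']
        client.close()

        def boost(result):
            domain = urllib.parse.urlparse(result['url']).netloc
            # Blend the existing keyword score with the user's click history.
            return result['score'] + 3 * clicks.get(domain, 0)

        return sorted(ranked_results, key=boost, reverse=True)

This could slot in after Ranking.sorted_results() in app.py once a login session
provides the username.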