diff --git a/richa/__pycache__/query_processing.cpython-38.pyc b/richa/__pycache__/query_processing.cpython-38.pyc
new file mode 100644
index 00000000..069705d1
Binary files /dev/null and b/richa/__pycache__/query_processing.cpython-38.pyc differ
diff --git a/richa/__pycache__/ranking.cpython-38.pyc b/richa/__pycache__/ranking.cpython-38.pyc
new file mode 100644
index 00000000..65f65614
Binary files /dev/null and b/richa/__pycache__/ranking.cpython-38.pyc differ
diff --git a/richa/app.py b/richa/app.py
new file mode 100644
index 00000000..4f1737de
--- /dev/null
+++ b/richa/app.py
@@ -0,0 +1,78 @@
+from flask import Flask, render_template, request
+import pymongo
+import os
+from flask_paginate import Pagination, get_page_args
+from ranking import Ranking
+from query_processing import QueryProcessing
+import time
+
+
+app = Flask(__name__)
+
+
+@app.route('/')
+def entry_point():
+    return render_template('home.html')
+
+
+@app.route('/search_results')
+def search_results():
+    connect_url = 'mongodb://127.0.0.1:27017/'
+
+    client = pymongo.MongoClient(connect_url, connect=False)
+
+    db = client.results
+
+    search_string = request.args.get('search')
+
+    processor = QueryProcessing(search_string)
+    keywords = processor.processor()
+
+    query = []
+
+    start = time.time()
+
+    for keyword in keywords:
+        query.extend(db.search_results.find(
+            {'$text': {'$search': keyword, '$caseSensitive': False}}))
+
+    end = time.time()
+    print(f"time to execute: {end-start}")
+
+    search_result = []
+
+    for doc in query:
+        exist = False
+        for result in search_result:
+            if result['title'] == doc['title'] or result['url'] == doc['url']:
+                exist = True
+                break
+
+        if not exist:
+            search_result.append(doc)
+
+    rank = Ranking(search_result, search_string)
+
+    ranked_result = rank.sorted_results()
+
+    client.close()
+
+    page, per_page, offset = get_page_args(page_parameter='page',
+                                           per_page_parameter='per_page')
+
+    total = len(ranked_result)
+
+    pagination = Pagination(page=page, per_page=per_page, total=total,
+                            css_framework='bootstrap4')
+
+    return render_template('search.html',
+                           search_result=ranked_result[offset:offset+per_page],
+                           page=page,
+                           per_page=per_page,
+                           pagination=pagination,
+                           search_string=search_string
+                           )
+
+
+if __name__ == '__main__':
+    app.run(debug=True)
\ No newline at end of file
diff --git a/richa/crawler/__pycache__/popular_links.cpython-38.pyc b/richa/crawler/__pycache__/popular_links.cpython-38.pyc
new file mode 100644
index 00000000..cd61faf8
Binary files /dev/null and b/richa/crawler/__pycache__/popular_links.cpython-38.pyc differ
diff --git a/richa/crawler/crawler.py b/richa/crawler/crawler.py
new file mode 100644
index 00000000..da394b04
--- /dev/null
+++ b/richa/crawler/crawler.py
@@ -0,0 +1,118 @@
+from bs4 import BeautifulSoup
+import requests
+import pymongo
+import os
+import urllib.parse
+from popular_links import Popularity
+import sys
+
+
+class Crawler():
+    connect_url = 'mongodb://127.0.0.1:27017/'
+
+    client = pymongo.MongoClient(connect_url)
+
+    db = client.results
+
+    search_results = []
+
+    url_count = 1
+
+    def start_crawl(self, url, depth):
+        robot_url = urllib.parse.urljoin(url, '/robots.txt')
+        try:
+            robots = requests.get(robot_url)
+        except requests.exceptions.RequestException:
+            # robots.txt could not be fetched; crawl without a disallow list
+            print("robots not found")
+            self.crawl(url, depth, [])
+            return
+
+        soup = BeautifulSoup(robots.text, 'lxml')
+
+        paragraph = soup.find('p')
+        sauce = paragraph.text if paragraph else robots.text
+
+        content = sauce.split()
+
+        disallowed_links = []
+
+        # collect every path or absolute url mentioned in robots.txt
+        for word in content:
+            if word[0] == '/':
+                disallowed_links.append(urllib.parse.urljoin(url, word))
+            elif 'http' in word:
+                disallowed_links.append(word)
+        print("got robots!!!")
+
+        self.crawl(url, depth, disallowed_links)
+
+    def crawl(self, url, depth, *disallowed_links):
+
+        try:
+            print(f'Crawling url {self.url_count}: {url} at depth: {depth}')
+            self.url_count += 1
+            response = requests.get(url)
+
+        except requests.exceptions.RequestException:
+            print(f'Failed to perform HTTP GET request on {url}')
+            return
+
+        soup = BeautifulSoup(response.text, 'lxml')
+
+        try:
+            title = soup.find('title').text
+            description = ''
+
+            # concatenate the text of every <p> tag as the page description
+            for tag in soup.findAll():
+                if tag.name == 'p':
+                    description += tag.text.strip().replace('\n', '')
+
+        except AttributeError:
+            print("Failed to retrieve title and description\n")
+            return
+
+        popularity = Popularity(url)
+        popularity_score = popularity.popularity_score()
+
+        query = {
+            'url': url,
+            'title': title,
+            'description': description,
+            'score': 0,
+            'popularity': popularity_score,
+        }
+
+        search_results = self.db.search_results
+
+        search_results.insert_one(query)
+
+        # idempotent: ensures the text index used by app.py exists
+        search_results.create_index([
+            ('url', pymongo.TEXT),
+            ('title', pymongo.TEXT),
+            ('description', pymongo.TEXT),
+            ('score', 1),
+            ('popularity', 1)
+        ], name='search_results', default_language='english')
+
+        if depth == 0:
+            return
+
+        links = soup.findAll('a')
+
+        disallowed = disallowed_links[0] if disallowed_links else []
+
+        for link in links:
+            try:
+                if link['href'] not in disallowed:
+                    if 'http' in link['href']:
+                        self.crawl(link['href'], depth - 1, disallowed)
+                    else:
+                        link['href'] = urllib.parse.urljoin(url, link['href'])
+                        self.crawl(link['href'], depth - 1, disallowed)
+            except KeyError:
+                # anchor tag without an href attribute
+                continue
+
+
+spider = Crawler()
+
+spider.start_crawl(
+    sys.argv[1], int(sys.argv[2]))
+
+spider.client.close()
\ No newline at end of file
diff --git a/richa/crawler/popular_links.py b/richa/crawler/popular_links.py
new file mode 100644
index 00000000..64f3eb63
--- /dev/null
+++ b/richa/crawler/popular_links.py
@@ -0,0 +1,18 @@
+class Popularity():
+    popular_domains = [
+        'https://pypi.org/', 'https://www.indiatoday.in/',
+    ]
+
+    ps = 0
+
+    def __init__(self, url):
+        self.url = url
+
+    def popularity_score(self):
+        for domain in self.popular_domains:
+            if domain == self.url:
+                self.ps += 100/len(self.popular_domains)
+            elif domain in self.url:
+                self.ps += 100/len(self.popular_domains)
+
+        return self.ps
\ No newline at end of file
diff --git a/richa/query_processing.py b/richa/query_processing.py
new file mode 100644
index 00000000..858b390c
--- /dev/null
+++ b/richa/query_processing.py
@@ -0,0 +1,37 @@
+import string
+
+import nltk
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem.porter import PorterStemmer
+from spellchecker import SpellChecker
+
+nltk.download('punkt')
+nltk.download('stopwords')
+
+
+class QueryProcessing():
+    def __init__(self, search_string):
+        self.search_string = search_string
+
+    def processor(self):
+        self.search_string = self.search_string.lower()
+
+        # strip punctuation and collapse repeated whitespace
+        translator = str.maketrans('', '', string.punctuation)
+        self.search_string = self.search_string.translate(translator)
+
+        self.search_string = " ".join(self.search_string.split())
+
+        # tokenize and drop English stop words
+        stop_words = set(stopwords.words("english"))
+        word_tokens = word_tokenize(self.search_string)
+        tokens = [word for word in word_tokens if word not in stop_words]
+
+        stemmer = PorterStemmer()
+        tokens = [stemmer.stem(word) for word in tokens]
+
+        # spell-correct each token; keep the original if no correction exists
+        spell = SpellChecker()
+        for i in range(len(tokens)):
+            tokens[i] = spell.correction(tokens[i]) or tokens[i]
+
+        return tokens
\ No newline at end of file
diff --git a/richa/ranking.py b/richa/ranking.py
new file mode 100644
index 00000000..1590fc3e
--- /dev/null
+++ b/richa/ranking.py
@@ -0,0 +1,61 @@
+from operator import itemgetter
+
+
+class Ranking:
+    def __init__(self, results, query):
+        self.results = results
+        self.query = query
+
+    def search(self):
+        res = []
+        filtered = []
+        if '"' in self.query:
+            # exact-phrase search: keep the quoted text as a single key
+            res.append(self.query.replace('"', '').strip())
+            return res
+
+        if ':' in self.query:  # filter by url => keyword:site
+            key = self.query.split(':')[0]
+            fil = self.query.split(':')[1]
+            for result in self.results:
+                if fil.lower() in result['url'].lower():
+                    filtered.append(result)
+            self.results = filtered
+        elif '-' in self.query:  # exclude a term => keyword-term
+            key = self.query.split('-')[0]
+            fil = self.query.split('-')[1]
+            for result in self.results:
+                if fil.lower() not in result['title'].lower() and fil.lower() not in result['description'].lower():
+                    filtered.append(result)
+            self.results = filtered
+        else:
+            key = self.query
+
+        res = key.split()
+        return res
+
+    def ranked_results(self):
+        # a keyword in the title counts twice as much as one in the description
+        keywords = self.search()
+        for key in keywords:
+            for result in self.results:
+                if key.lower() in result['title'].lower():
+                    result['score'] += 2
+                if key.lower() in result['description'].lower():
+                    result['score'] += 1
+
+        return self.results
+
+    def sorted_results(self):
+        ranked_searches = self.ranked_results()
+
+        sorted_searches = sorted(
+            ranked_searches, key=itemgetter('popularity', 'score'), reverse=True)
+
+        return sorted_searches
\ No newline at end of file
diff --git a/richa/readme.md b/richa/readme.md
new file mode 100644
index 00000000..01755b0a
--- /dev/null
+++ b/richa/readme.md
@@ -0,0 +1,18 @@
+# Glugle
+### A search engine that shows results fetched from a few websites
+
+
+### Demo
+
+![Alt Text](static/glugle.gif)
+
+![Alt Text](static/results.png)
+
+### Tech Stack used:
+> Python <br>
+> MongoDB <br>
+> Flask
+
+### Future Aspects
+
+> It can be improved further by adding voice-based searches and user log-in.
diff --git a/richa/static/css/style.css b/richa/static/css/style.css
new file mode 100644
index 00000000..63da678d
--- /dev/null
+++ b/richa/static/css/style.css
@@ -0,0 +1,116 @@
+body {
+    background-color: white;
+    color: rgb(48, 46, 46);
+    font-size: 25px;
+}
+
+.dark-mode {
+    background-color: rgb(48, 46, 46);
+    color: white;
+}
+
+.nav-link.active svg {
+    width: 20px;
+    height: 20px;
+}
+
+:root {
+    --navHeight: 30px;
+}
+
+.switch {
+    width: 55px;
+    display: flex;
+    align-items: center;
+    justify-content: space-between;
+}
+
+.switch div {
+    position: relative;
+    display: block;
+    background: #eee;
+    width: 69px;
+    border-radius: 50px;
+    padding: 0 5px;
+    box-sizing: border-box;
+    cursor: pointer;
+}
+
+.fa-adjust {
+    transform: rotate(180deg);
+}
+
+.switch input {
+    display: none;
+}
+
+.slider {
+    background-color: rgb(166, 174, 221);
+    transition: 0.4s;
+    border-radius: 34px;
+    height: 15px;
+    width: 20px;
+    display: inline-block;
+    position: relative;
+}
+
+input:checked + .slider {
+    transform: translateX(16px);
+}
+
+nav {
+    background: #d2cca1;
+    box-shadow: 0 0 4px rgba(0, 0, 0, 0.3);
+    font-size: 1.1rem;
+    position: relative;
+}
+
+nav ul {
+    list-style-type: none;
+}
+
+nav ul li {
+    /* padding: 12px 10px; */
+    cursor: pointer;
+    transition: background 0.3s ease;
+    border-radius: 4px;
+    position: relative;
+}
+
+.take_query {
+    border-radius: 20px;
+    outline: none;
+    border: 1.5px solid rgb(31, 31, 32);
+    width: 500px;
+}
+
+.take_query:hover {
+    outline: blue;
+    border: 3px solid white;
+}
+
+.button_submit {
+    display: inline-block;
+    background-color: rgb(88, 91, 105);
+    border-radius: 20px;
+    border: none;
+    outline: none !important;
+    margin-top: 20px;
+    color: white;
+    padding: 8px;
+}
+
+.button_submit:hover {
+    transition: 0.3s all ease-in-out;
+    background-color: rgb(181, 182, 187);
+    color: black;
+}
\ No newline at end of file
diff --git a/richa/static/glugle.gif b/richa/static/glugle.gif
new file mode 100644
index 00000000..2fa850ea
Binary files /dev/null and b/richa/static/glugle.gif differ
diff --git a/richa/static/results.png b/richa/static/results.png
new file mode 100644
index 00000000..0aef0cbb
Binary files /dev/null and b/richa/static/results.png differ
diff --git a/richa/templates/base.html b/richa/templates/base.html
new file mode 100644
index 00000000..d0d1b66f
--- /dev/null
+++ b/richa/templates/base.html
@@ -0,0 +1,59 @@
+Glugle
+
+{% block content %}
+{% endblock %}
\ No newline at end of file
diff --git a/richa/templates/home.html b/richa/templates/home.html
new file mode 100644
index 00000000..911fd7e2
--- /dev/null
+++ b/richa/templates/home.html
@@ -0,0 +1,23 @@
+{% extends 'base.html'%}
+
+{% block content %}
+
+GLUGLE
+
+{% endblock %}
\ No newline at end of file
diff --git a/richa/templates/search.html b/richa/templates/search.html
new file mode 100644
index 00000000..d7893542
--- /dev/null
+++ b/richa/templates/search.html
@@ -0,0 +1,44 @@
+{% extends 'base.html' %}
+
+{% block content %}
+
+Showing results for '{{search_string}}'
+
+{% if search_result %}
+{% for link in search_result %}
+{{ link.url }}
+{{ link.description[:300] }}...
+{% endfor %}
+{% else %}
+No results found
+{% endif %}
+
+{{pagination.links}}
+
+{% endblock %}
\ No newline at end of file
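For reference, here is a minimal sketch of how the query-processing and ranking pieces added in this diff fit together, using a small in-memory result list instead of the MongoDB collection that crawler.py populates. The sample documents and the example query are hypothetical; only the field names (url, title, description, score, popularity) and the QueryProcessing/Ranking interfaces come from the code above.

```python
# Hypothetical smoke test for the classes introduced in this diff.
# Run from the richa/ directory so query_processing.py and ranking.py import.
from query_processing import QueryProcessing
from ranking import Ranking

# Stand-in for documents that crawler.py would normally insert into MongoDB.
sample_results = [
    {'url': 'https://pypi.org/project/flask/', 'title': 'Flask on PyPI',
     'description': 'Flask is a lightweight web application framework.',
     'score': 0, 'popularity': 50},
    {'url': 'https://example.com/news', 'title': 'Daily headlines',
     'description': 'General news coverage.', 'score': 0, 'popularity': 0},
]

search_string = 'Flask framework'

# Normalise, stem and spell-correct the raw query into keywords.
keywords = QueryProcessing(search_string).processor()
print(keywords)

# Score and sort the candidate documents for the same query.
ranked = Ranking(sample_results, search_string).sorted_results()
for doc in ranked:
    print(doc['popularity'], doc['score'], doc['url'])
```

In app.py the same two steps run against the search_results collection instead of this in-memory list.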