lugnitdgp · Richa812 · Oct 25, 2021 · Oct 26, 2021 · Oct 28, 2021 · Oct 29, 2021
diff --git a/richa/__pycache__/query_processing.cpython-38.pyc b/richa/__pycache__/query_processing.cpython-38.pyc
diff --git a/richa/__pycache__/ranking.cpython-38.pyc b/richa/__pycache__/ranking.cpython-38.pyc
diff --git a/richa/app.py b/richa/app.py
@@ -0,0 +1,78 @@
+from flask import Flask, render_template, request
+import pymongo
+import os
+from flask_paginate import Pagination, get_page_args
+from ranking import Ranking
+from query_processing import QueryProcessing
+import time
+
+
+# WSGI application object; the route decorators in this module register
+# their handlers on it.
+app = Flask(__name__)
+
+
+@app.route('/')
+def entry_point():
+    """Serve the landing page by rendering the ``home.html`` template."""
+    landing_page = render_template('home.html')
+    return landing_page
+
+
+@app.route('/search_results')
+def search_results():
+    """Run a text search against MongoDB and render one page of ranked hits.
+
+    Reads the ``search`` query-string parameter, turns it into keywords with
+    ``QueryProcessing``, runs one ``$text`` query per keyword against the
+    ``results.search_results`` collection, drops duplicate documents, ranks
+    the remainder with ``Ranking`` and renders ``search.html`` for the page
+    selected by the ``page`` / ``per_page`` request parameters.
+    """
+    connect_url = 'mongodb://127.0.0.1:27017/'
+
+    # A request without ``?search=`` yields None, which would crash the
+    # query processor; treat it as an empty query instead.
+    search_string = request.args.get('search') or ''
+
+    processor = QueryProcessing(search_string)
+    keywords = processor.processor()
+
+    client = pymongo.MongoClient(connect_url, connect=False)
+    try:
+        collection = client.results.search_results
+
+        matches = []
+
+        start = time.time()
+
+        for keyword in keywords:
+            # ``$search`` needs a non-empty string; skip anything else.
+            if not keyword:
+                continue
+            matches.extend(collection.find(
+                {'$text': {'$search': keyword, '$caseSensitive': False}}))
+
+        end = time.time()
+        print(f"time to execute: {end-start}")
+    finally:
+        # Release the connection even when a query raises.
+        client.close()
+
+    # A document is a duplicate when an already accepted result has the same
+    # title OR the same url. The two sets hold exactly the titles and urls of
+    # the accepted results, so membership tests replace the nested scan.
+    seen_titles = set()
+    seen_urls = set()
+    search_result = []
+
+    for doc in matches:
+        if doc['title'] in seen_titles or doc['url'] in seen_urls:
+            continue
+        seen_titles.add(doc['title'])
+        seen_urls.add(doc['url'])
+        search_result.append(doc)
+
+    rank = Ranking(search_result, search_string)
+
+    ranked_result = rank.sorted_results()
+
+    page, per_page, offset = get_page_args(page_parameter='page',
+                                           per_page_parameter='per_page')
+
+    total = len(ranked_result)
+
+    pagination = Pagination(page=page, per_page=per_page, total=total,
+                            css_framework='bootstrap4')
+
+    return render_template('search.html',
+                           search_result=ranked_result[offset:offset+per_page],
+                           page=page,
+                           per_page=per_page,
+                           pagination=pagination,
+                           search_string=search_string
+                           )
+
+
+if __name__ == '__main__':
+    # The Werkzeug debugger lets anyone who can reach the server execute
+    # code, so it must be possible to switch it off without editing the
+    # source. Debug mode stays on by default (the previous behaviour) and is
+    # disabled by setting the environment variable GLUGLE_DEBUG=0.
+    debug_enabled = os.environ.get('GLUGLE_DEBUG', '1') != '0'
+    app.run(debug=debug_enabled)
diff --git a/richa/crawler/__pycache__/popular_links.cpython-38.pyc b/richa/crawler/__pycache__/popular_links.cpython-38.pyc
diff --git a/richa/crawler/crawler.py b/richa/crawler/crawler.py
@@ -0,0 +1,118 @@
+from bs4 import BeautifulSoup
+import requests
+import pymongo
+import os
+import urllib.parse
+from popular_links import Popularity
+import sys
+
+
+class Crawler():
+    """Recursive web crawler that stores page text in MongoDB.
+
+    Every stored page becomes one document in the ``results.search_results``
+    collection with the fields ``url``, ``title``, ``description``,
+    ``score`` (always 0 at crawl time) and ``popularity``.
+
+    The MongoDB client is a class attribute, so it is created when the class
+    body is executed and is shared by all instances.
+    """
+
+    connect_url = 'mongodb://127.0.0.1:27017/'
+
+    client = pymongo.MongoClient(connect_url)
+
+    db = client.results
+
+    search_results = []
+
+    url_count = 1
+
+    # Seconds to wait for a server before a request is abandoned; without a
+    # timeout a single unresponsive host would stall the whole crawl.
+    request_timeout = 10
+
+    def __init__(self):
+        # url -> greatest remaining depth at which it has been crawled.
+        # Used to stop cyclic links from being fetched over and over.
+        self.crawled_depth = {}
+        # URLs already written to the collection, so a page that is fetched
+        # again at a greater depth is not inserted twice.
+        self.stored_urls = set()
+        # True once the text index has been requested during this run.
+        self.index_ready = False
+
+    def start_crawl(self, url, depth):
+        """Crawl ``url`` down to ``depth`` link levels, honouring robots.txt.
+
+        The site's ``/robots.txt`` is fetched first. If it cannot be fetched
+        the crawl proceeds without restrictions. The shared MongoDB client is
+        closed once the whole crawl has finished.
+
+        Args:
+            url: Absolute URL of the page to start from.
+            depth: Number of link levels to follow below ``url``.
+        """
+        robot_url = urllib.parse.urljoin(url, '/robots.txt')
+        disallowed_links = []
+
+        try:
+            robots = requests.get(robot_url, timeout=self.request_timeout)
+        except requests.RequestException:
+            print("robots not found")
+        else:
+            if robots.ok:
+                disallowed_links = self._parse_disallowed(url, robots.text)
+                print("got robots!!!")
+            else:
+                # An error page is not a robots file; do not parse it.
+                print("robots not found")
+
+        try:
+            self.crawl(url, depth, disallowed_links)
+        finally:
+            # Close only after the complete recursive crawl: closing inside
+            # crawl() would shut the client while parent calls still need it.
+            self.client.close()
+
+    @staticmethod
+    def _parse_disallowed(base_url, robots_text):
+        """Return the absolute URL prefixes named by ``Disallow:`` rules.
+
+        Rules of every user-agent group are collected, which is the
+        conservative choice. Wildcard patterns are not expanded.
+        """
+        disallowed = []
+        for raw_line in robots_text.splitlines():
+            line = raw_line.split('#', 1)[0].strip()  # drop comments
+            directive, _, value = line.partition(':')
+            if directive.strip().lower() != 'disallow':
+                continue
+            value = value.strip()
+            # An empty "Disallow:" means nothing is disallowed.
+            if value:
+                disallowed.append(urllib.parse.urljoin(base_url, value))
+        return disallowed
+
+    def crawl(self, url, depth, *disallowed_links):
+        """Fetch ``url``, store its text and recurse into its links.
+
+        Args:
+            url: Absolute URL to fetch.
+            depth: Remaining link levels; at 0 the page is stored but its
+                links are not followed.
+            *disallowed_links: Optional. When given, the first value is a
+                list of URL prefixes that must not be crawled.
+
+        The MongoDB client is not closed here; ``start_crawl`` does that.
+        """
+        disallowed = disallowed_links[0] if disallowed_links else []
+
+        # Skip pages already crawled at this depth or deeper; a revisit with
+        # more remaining depth is allowed so its links can still be followed.
+        previous_depth = self.crawled_depth.get(url)
+        if previous_depth is not None and previous_depth >= depth:
+            return
+        self.crawled_depth[url] = depth
+
+        print(f'Crawling url {self.url_count}: {url} at depth: {depth}')
+        self.url_count += 1
+        try:
+            response = requests.get(url, timeout=self.request_timeout)
+        except requests.RequestException:
+            print(f'Failed to perform HTTP GET request on {url}')
+            return
+
+        soup = BeautifulSoup(response.text, 'lxml')
+
+        title_tag = soup.find('title')
+        if title_tag is None:
+            print("Failed to retrieve title and description\n")
+            return
+        title = title_tag.text
+        # The description is the text of all paragraphs, newlines removed.
+        description = ''.join(
+            paragraph.text.strip().replace('\n', '')
+            for paragraph in soup.find_all('p'))
+
+        if url not in self.stored_urls:
+            self._store_page(url, title, description)
+            self.stored_urls.add(url)
+
+        if depth == 0:
+            return
+
+        for link in soup.find_all('a'):
+            href = link.get('href')
+            if not href:
+                continue
+
+            # urljoin leaves absolute URLs untouched and resolves relative
+            # ones; the fragment is dropped because it names the same page.
+            target = urllib.parse.urldefrag(
+                urllib.parse.urljoin(url, href)).url
+            if urllib.parse.urlparse(target).scheme not in ('http', 'https'):
+                continue  # mailto:, javascript:, tel:, ...
+            if any(target.startswith(prefix) for prefix in disallowed):
+                continue
+
+            self.crawl(target, depth - 1, disallowed)
+
+    def _store_page(self, url, title, description):
+        """Insert one page document and make sure the text index exists."""
+        popularity_score = Popularity(url).popularity_score()
+
+        collection = self.db.search_results
+
+        collection.insert_one({
+            'url': url,
+            'title': title,
+            'description': description,
+            'score': 0,
+            'popularity': popularity_score,
+        })
+
+        # Requesting the index once per run is enough.
+        if not self.index_ready:
+            collection.create_index([
+                ('url', pymongo.TEXT),
+                ('title', pymongo.TEXT),
+                ('description', pymongo.TEXT),
+                ('score', 1),
+                ('popularity', 1)
+            ], name='search_results', default_language='english')
+            self.index_ready = True
+
+
+# Script entry: the start URL and the crawl depth are taken from the first
+# and second command-line arguments.
+spider = Crawler()
+
+start_url = sys.argv[1]
+max_depth = int(sys.argv[2])
+spider.start_crawl(start_url, max_depth)
diff --git a/richa/crawler/popular_links.py b/richa/crawler/popular_links.py
@@ -0,0 +1,18 @@
+class Popularity():
+    """Score a URL by whether it belongs to a hand-picked popular site."""
+
+    # URL prefixes (scheme and host included) of the sites considered popular.
+    popular_domains = [
+        'https://pypi.org/', 'https://www.indiatoday.in/',
+    ]
+
+    # Score computed by the most recent popularity_score() call.
+    ps = 0
+
+    def __init__(self, url):
+        self.url = url
+
+    def popularity_score(self):
+        """Return the popularity score of ``self.url``.
+
+        Each popular domain is worth ``100 / len(popular_domains)`` points.
+        A URL located under a popular domain earns that share once; a URL
+        that is exactly the domain root earns it twice. The result is also
+        stored in ``self.ps``.
+
+        The score is recomputed from zero on every call, so calling the
+        method repeatedly returns the same value instead of accumulating.
+        """
+        score = 0
+        if self.popular_domains:
+            share = 100 / len(self.popular_domains)
+            for domain in self.popular_domains:
+                if self.url == domain:
+                    score += share
+                # Prefix test: the domain must be where the URL points to,
+                # not merely appear somewhere inside it (e.g. in a query
+                # string of an unrelated site).
+                if self.url.startswith(domain):
+                    score += share
+
+        self.ps = score
+        return score
diff --git a/richa/query_processing.py b/richa/query_processing.py
@@ -0,0 +1,37 @@
+import nltk
+from nltk.corpus import stopwords
+nltk.download('punkt')
+nltk.download('stopwords')
+from nltk.tokenize import word_tokenize
+from nltk.stem.porter import PorterStemmer
+from spellchecker import SpellChecker
+import string
+
+
+
+class QueryProcessing():
+    """Turn a raw search string into a list of normalised keywords."""
+
+    def __init__(self, search_string):
+        self.search_string = search_string
+
+    def processor(self):
+        """Normalise the search string and return its keywords.
+
+        Steps: lower-case, strip punctuation, collapse whitespace, tokenise,
+        drop English stop words, correct spelling, then stem. The cleaned-up
+        string (before tokenising) is written back to ``self.search_string``.
+
+        Returns:
+            A list of keyword strings; empty when the query is empty, None,
+            or consists only of stop words and punctuation.
+        """
+        # A missing query (None) is handled like an empty one.
+        text = (self.search_string or '').lower()
+
+        translator = str.maketrans('', '', string.punctuation)
+        text = text.translate(translator)
+
+        text = " ".join(text.split())
+        self.search_string = text
+
+        stop_words = set(stopwords.words("english"))
+        tokens = [word for word in word_tokenize(text)
+                  if word not in stop_words]
+
+        # Spelling is corrected on whole words, before stemming: stems such
+        # as "comput" are not dictionary words and would be "corrected" into
+        # something unrelated. correction() may return None when it has no
+        # candidate, in which case the word is kept as typed.
+        spell = SpellChecker()
+        tokens = [spell.correction(word) or word for word in tokens]
+
+        stemmer = PorterStemmer()
+        return [stemmer.stem(word) for word in tokens]
diff --git a/richa/ranking.py b/richa/ranking.py
@@ -0,0 +1,61 @@
+from operator import itemgetter
+import string
+
+
+class Ranking:
+    """Filter, score and order search results for a raw user query.
+
+    Query syntax understood by ``search``:
+
+    * ``"exact phrase"``  - the text between the quotes is one keyword.
+    * ``keywords:site``   - keep only results whose URL contains ``site``.
+    * ``keywords-term``   - drop results whose title or description
+      contains ``term`` (several ``-term`` parts may be given).
+
+    Each result is a dict with at least the keys ``url``, ``title``,
+    ``description``, ``score`` and ``popularity``.
+    """
+
+    def __init__(self, results, query):
+        self.results = results
+        self.query = query
+
+    def search(self):
+        """Apply the query's filter operator and return its keywords.
+
+        May replace ``self.results`` with a filtered list.
+
+        Returns:
+            The list of keyword strings to score results against.
+        """
+        query = self.query
+
+        if '"' in query:
+            # Remove the quotes but keep the words and the spaces between
+            # them, so the phrase can actually occur in a title.
+            phrase = " ".join(query.replace('"', ' ').split())
+            return [phrase] if phrase else []
+
+        if ':' in query:  # filter by url search query => query:url
+            # partition keeps everything after the first colon, so a filter
+            # such as "https://example.org" is not cut off at "https".
+            key, _, site = query.partition(':')
+            site = site.strip().lower()
+            if site:
+                self.results = [result for result in self.results
+                                if site in result['url'].lower()]
+        elif '-' in query:
+            key, *excluded = query.split('-')
+            excluded = [term.strip().lower() for term in excluded]
+            excluded = [term for term in excluded if term]
+            # A result is dropped when an excluded term occurs in its title
+            # or in its description, i.e. it is kept only when the term is
+            # absent from both.
+            self.results = [
+                result for result in self.results
+                if not any(term in result['title'].lower()
+                           or term in result['description'].lower()
+                           for term in excluded)
+            ]
+        else:
+            key = query
+
+        return key.split()
+
+    def ranked_results(self):
+        """Add keyword-match points to every result's ``score``.
+
+        A keyword found in the title adds 2, one found in the description
+        adds 1 (case-insensitive substring match). The result dicts are
+        modified in place.
+
+        Returns:
+            ``self.results`` after filtering and scoring.
+        """
+        # search() must run first: it may replace self.results.
+        keywords = [key.lower() for key in self.search()]
+
+        for result in self.results:
+            title = result['title'].lower()
+            description = result['description'].lower()
+            for key in keywords:
+                if key in title:
+                    result['score'] += 2
+                if key in description:
+                    result['score'] += 1
+
+        return self.results
+
+    def sorted_results(self):
+        """Return the scored results, best first.
+
+        Results are ordered by ``popularity`` and then by ``score``, both
+        descending.
+        """
+        ranked_searches = self.ranked_results()
+
+        return sorted(
+            ranked_searches, key=itemgetter('popularity', 'score'),
+            reverse=True)
diff --git a/richa/readme.md b/richa/readme.md
@@ -0,0 +1,18 @@
+# Glugle
+### A search engine which shows results fetched from a few websites
+
+
+### Demo
+
+![Alt Text](static/glugle.gif)<br>
+
+![Alt Text](static/results.png)
+
+### Tech Stack used:
+> Python<br>
+> MongoDB<br>
+> Flask
+
+### Future Aspects
+
+> It can be improved further by adding voice-based searches and user log-in.