Simpola-task4-Ranking #60

Open: wants to merge 12 commits into base: master.
13 changes: 13 additions & 0 deletions Simpola-Task4/README.md
@@ -0,0 +1,13 @@
# Flearch

<hr/>

## About the project:

Flearch is a fully functional search engine written in Python. It returns results ranked against a textual search query. MongoDB is used as the database, and several Python libraries, including BeautifulSoup, PyMongo, and Requests, are utilized; the web application is built with Flask. A brief setup sketch follows the tech stack list below.

## Tech stack used:

- Python programming language
- MongoDB
- Flask
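
## Running the project (sketch):

The Flask app relies on NLTK data (the stopword list and the Punkt tokenizer) that is not installed automatically. The snippet below is a minimal setup sketch, added here for illustration rather than prescribed by the project; newer NLTK releases may ask for `punkt_tab` instead of `punkt`.

```python
# One-time NLTK setup assumed by app.py (stopwords + word_tokenize).
import nltk

nltk.download("stopwords")  # stopword list used by search_string_optimizations
nltk.download("punkt")      # tokenizer model behind word_tokenize
```

With MongoDB running locally on port 27017, the crawler is started with `python crawler.py <seed_url> <depth>` and the web app with `python app.py`.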
Binary file added Simpola-Task4/__pycache__/app.cpython-38.pyc
Binary file not shown.
119 changes: 119 additions & 0 deletions Simpola-Task4/app.py
@@ -0,0 +1,119 @@
from flask import Flask, render_template, url_for, request
import pymongo
from flask_paginate import Pagination, get_page_args
import os
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


app = Flask(__name__)


@app.route("/")
def hello_name():
return render_template('home.html')


def search_string_optimizations(search_string):
    # lowercase the search string
    search_string = search_string.lower()

    # remove punctuation from the search string
    translator = str.maketrans('', '', string.punctuation)
    search_string = search_string.translate(translator)

    # tokenize the search string and remove stopwords
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(search_string)
    filtered_search_string = [
        word for word in word_tokens if word not in stop_words]

    # stem the remaining tokens
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in filtered_search_string]
    return stems
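
# Illustrative example (not part of the original code):
# search_string_optimizations("Running the Crawlers!") lowercases the text,
# strips the punctuation and the stopword "the", stems what remains, and
# returns ['run', 'crawler'].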


def sort_rank(required, optimized_res):
    # boost results whose title or description contains a query stem:
    # +2 for a title hit, +1 for a description hit
    for result in required:
        for word in optimized_res:
            if word in result['title'].lower():
                result['score'] += 2
            if word in result['description'].lower():
                result['score'] += 1
    return sorted(required, key=lambda result: result['score'], reverse=True)
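
# Illustration with hypothetical data (not part of the original code): for
# optimized_res = ['crawler'], a result whose title contains "crawler" gains
# +2, and +1 more if its description contains it, on top of the MongoDB
# text score it already carries.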


@app.route("/search_results")
def fun():
client = pymongo.MongoClient("mongodb://127.0.0.1:27017", connect=False)
db = client.glugledb
search_string = request.args.get('search')
search_results = []
required = []

# optimized
optimized_res = search_string_optimizations(search_string)

search_results = db.results.find(
{
"$text": {
"$search": search_string,
'$caseSensitive': False
}
},
{
"score": {
'$meta' : "textScore"
}
}). sort(
[
('sort', {'$meta': 'textScore'}),
('_id', pymongo.DESCENDING)
]
)

for object in search_results:
exist = False
for result in required:
if result['title'] == object['title'] or result['url'] == object['url']:
exist = True
break

if exist == False:
# print(dir(object))
required.append(object)
# print(required)

# Applying the ranking mechanism
required = sort_rank(required, optimized_res)

page, per_page, offset = get_page_args(
page_parameter='page', per_page_parameter='per_page')

total = search_results.count()
# return render_template('searches.html', required=required)

pagination = Pagination(page=page, per_page=per_page,
total=total, css_framework='bootstrap4')

return render_template('searches.html',
required=required[offset:offset+per_page],
page=page,
per_page=per_page,
pagination=pagination,
total=total,
search_string=search_string
)


if __name__ == '__main__':
app.run(debug=True)
100 changes: 100 additions & 0 deletions Simpola-Task4/crawler.py
@@ -0,0 +1,100 @@
import requests
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
import pymongo
import lxml
import sys


class Crawler:
    client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
    db = client.glugledb
    collection = db.info

    disallowed_link = []

    def start_crawl(self, url, depth):
        robots = urllib.parse.urljoin(url, '/robots.txt')

        try:
            robot = requests.get(robots)
        except requests.exceptions.RequestException:
            print("Robots not found here!")
            self.crawl(url, depth)
            return

        # collect every path-like token from robots.txt as a disallowed link
        for word in robot.text.split():
            if word[0] == '/':
                self.disallowed_link.append(urllib.parse.urljoin(url, word))
        print("Robots found and appended in disallowed_links...")

        self.crawl(url, depth, self.disallowed_link)

    def crawl(self, url, depth, disallowed_link=()):
        try:
            print(f"Crawling url {url} at depth: {depth}")
            response = requests.get(url)
        except requests.exceptions.RequestException:
            print(f"Failed to perform HTTP GET request on {url}")
            return
        soup = BeautifulSoup(response.text, 'lxml')

        try:
            title = soup.find('title').text
            desc = ''

            # concatenate all paragraph text as the page description
            for tag in soup.findAll():
                if tag.name == 'p':
                    desc = desc + tag.text.strip().replace('\n', '')

        except AttributeError:
            print("Failed to retrieve title and desc...")
            return

        query = {
            'url': url,
            'title': title,
            'description': desc,
            'score': 0,
        }

        results = self.db.results
        results.insert_one(query)
        # create_index is idempotent, so calling it for every page is safe
        results.create_index(
            [
                ('url', pymongo.TEXT),
                ('title', pymongo.TEXT),
                ('description', pymongo.TEXT),
                ('score', 1)
            ],
            name='results',
            default_language='english'
        )

        if depth == 0:
            return

        links = soup.findAll('a')

        for link in links:
            try:
                if link['href'] not in disallowed_link:
                    if 'http' in link['href']:
                        self.crawl(link['href'], depth - 1, disallowed_link)
                    else:
                        link['href'] = urllib.parse.urljoin(url, link['href'])
                        self.crawl(link['href'], depth - 1, disallowed_link)
            except KeyError:
                print("No links retrieved from the page")


# Usage: python crawler.py <seed_url> <depth>
spider = Crawler()
spider.start_crawl(
    sys.argv[1], int(sys.argv[2])
)
spider.client.close()
38 changes: 38 additions & 0 deletions Simpola-Task4/requirements.txt
@@ -0,0 +1,38 @@
appdirs==1.4.3
asgiref==3.2.3
beautifulsoup4==4.7.1
cachelib==0.4.1
certifi==2018.11.29
chardet==3.0.4
charset-normalizer==2.0.7
click==8.0.3
defusedxml==0.5.0
distlib==0.3.0
Django==3.0.2
django-allauth==0.38.0
django-braces==1.13.0
django-crispy-forms==1.8.1
django-datetimepicker==3.14
dnspython==2.1.0
Flask==2.0.2
flask-paginate==0.8.1
Flask-Session==0.4.0
gunicorn==20.1.0
idna==2.8
itsdangerous==2.0.1
Jinja2==3.0.2
lxml==4.6.3
Markdown==3.2.2
MarkupSafe==2.0.1
numpy==1.21.3
oauthlib==3.0.0
Pillow==8.4.0
pymongo==3.12.1
python3-openid==3.1.0
pytz==2018.9
requests==2.21.0
requests-oauthlib==1.2.0
soupsieve==1.8
sqlparse==0.3.0
urllib3==1.24.2
Werkzeug==2.0.2