lugnitdgp · rk18venom · Oct 29, 2021 · Oct 29, 2021
diff --git a/Rahul/task4/Crawler/__pycache__/popularlinks.cpython-39.pyc b/Rahul/task4/Crawler/__pycache__/popularlinks.cpython-39.pyc
diff --git a/Rahul/task4/Crawler/crawler.py b/Rahul/task4/Crawler/crawler.py
@@ -0,0 +1,168 @@
+from bs4 import BeautifulSoup
+import requests
+import pymongo
+import urllib
+import urllib.parse
+import sys
+from popularlinks import Popularity
+import lxml
+
+
+class Crawler():
+    """Recursive web crawler that stores page summaries in MongoDB.
+
+    Every fetched page becomes one document in the ``search_results``
+    collection: its URL, title, concatenated ``<p>`` text, a ``score`` of 0
+    and the ``popularity`` value computed by ``Popularity``.
+    """
+
+    connection_url = "mongodb://127.0.0.1:27017/"
+
+    client = pymongo.MongoClient(connection_url)
+
+    db = client.databaseofglugle
+
+    # Absolute URL prefixes taken from robots.txt ``Disallow`` rules.
+    disallowed_links = []
+    # Starts at 1 and is bumped once per fetch attempt.
+    url_count = 1
+    # Seconds to wait for one HTTP response before giving up on it.
+    request_timeout = 10
+
+    def start_crawl(self, url, depth):
+        """Load the site's robots.txt rules, then crawl from ``url``.
+
+        Args:
+            url: Absolute URL of the page to start from.
+            depth: Number of link levels to follow below ``url``.
+        """
+        # Rebind on the instance so one site's rules never pile up in the
+        # class-level list shared by every Crawler.
+        self.disallowed_links = self._fetch_disallowed(url)
+        self._visited = set()
+
+        try:
+            self.crawl(url, depth, self.disallowed_links)
+        finally:
+            # Close once, after the whole recursion is done; closing inside
+            # crawl() would cut the connection under the parent calls.
+            self.client.close()
+
+    def _fetch_disallowed(self, url):
+        """Return the absolute URL prefixes robots.txt asks us to skip."""
+        robots_url = urllib.parse.urljoin(url, '/robots.txt')
+
+        try:
+            robots = requests.get(robots_url, timeout=self.request_timeout)
+        except requests.RequestException:
+            print("robots not found!!!")
+            return []
+        if not robots.ok:
+            # An error page is not a rule set; crawl without restrictions.
+            print("robots not found!!!")
+            return []
+
+        blocked = []
+        for line in robots.text.splitlines():
+            # Strip trailing comments, then split "Field: value".
+            field, _, value = line.split('#', 1)[0].partition(':')
+            path = value.strip()
+            # Rules for every user agent are honoured (conservative), and
+            # an empty Disallow value means "nothing is disallowed".
+            if field.strip().lower() == 'disallow' and path:
+                blocked.append(urllib.parse.urljoin(url, path))
+
+        print("robots found and appended in disallowed_links...")
+        return blocked
+
+    def crawl(self, url, depth, *disallowed_links):
+        """Fetch ``url``, store it, and recurse into the links it contains.
+
+        Args:
+            url: Absolute URL of the page to fetch.
+            depth: Remaining link levels to follow; at 0 or below only this
+                page is stored.
+            *disallowed_links: Optional. The first value is an iterable of
+                URL prefixes that must not be crawled; when omitted the
+                instance's ``disallowed_links`` is used.
+        """
+        blocked = tuple(
+            disallowed_links[0] if disallowed_links else self.disallowed_links
+        )
+        # Works even when crawl() is called without start_crawl() first.
+        visited = self.__dict__.setdefault('_visited', set())
+        if url in visited:
+            # Already stored during this run; avoids loops and duplicates.
+            return
+        visited.add(url)
+
+        print(f"Crawling url {url} at depth: {depth}")
+        self.url_count += 1
+        try:
+            response = requests.get(url, timeout=self.request_timeout)
+            response.raise_for_status()
+        except requests.RequestException:
+            print(f"Failed to perform HTTP GET request on {url}")
+            return
+
+        soup = BeautifulSoup(response.text, 'lxml')
+
+        title_tag = soup.find('title')
+        if title_tag is None:
+            print("Failed to retrieve title and description...")
+            return
+        description = ''.join(
+            tag.text.strip().replace('\n', '') for tag in soup.find_all('p')
+        )
+        self._store_page(url, title_tag.text, description)
+
+        if depth <= 0:
+            return
+
+        for link in soup.find_all('a', href=True):
+            # urljoin leaves absolute hrefs alone and resolves relative ones;
+            # the fragment is dropped so "#section" links are not refetched.
+            target = urllib.parse.urldefrag(
+                urllib.parse.urljoin(url, link['href'])
+            ).url
+            if not target.startswith(('http://', 'https://')):
+                continue  # mailto:, javascript:, tel:, ...
+            if target.startswith(blocked):
+                continue  # covered by a robots.txt Disallow prefix
+            self.crawl(target, depth - 1, blocked)
+
+    def _store_page(self, url, title, description):
+        """Insert one page document into ``search_results``."""
+        search_results = self.db.search_results
+
+        # The index definition never changes, so build it once per crawler
+        # instead of once per page.
+        if not self.__dict__.get('_index_ready'):
+            search_results.create_index(
+                [
+                    ('url', pymongo.TEXT),
+                    ('title', pymongo.TEXT),
+                    ('description', pymongo.TEXT),
+                    ('score', 1),
+                    ('popularity', 1),
+                ],
+                name='search_results',
+                default_language="english",
+            )
+            self._index_ready = True
+
+        search_results.insert_one({
+            'url': url,
+            'title': title,
+            'description': description,
+            'score': 0,
+            'popularity': Popularity(url).popularity_score(),
+        })
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        sys.exit("usage: python crawler.py <start-url> <depth>")
+
+    crawler = Crawler()
+    crawler.start_crawl(sys.argv[1], int(sys.argv[2]))
diff --git a/Rahul/task4/Crawler/popularlinks.py b/Rahul/task4/Crawler/popularlinks.py
@@ -0,0 +1,34 @@
+class Popularity():
+    """Score a URL by whether it belongs to a well-known site."""
+
+    popular_domains = ['https://en.wikipedia.org/', 'https://www.python.org/', 'https://www.rottentomatoes.com/',
+                       'https://pypi.org/', 'https://www.indiatoday.in/', 'https://www.geeksforgeeks.org/',
+                       'https://stackoverflow.com/']
+
+    # Result of the most recent popularity_score() call (0 before any call).
+    ps = 0
+
+    def __init__(self, url):
+        self.url = url
+
+    def popularity_score(self):
+        """Return the popularity score of ``self.url``.
+
+        A URL located under a popular site earns one share, where a share is
+        ``100 / len(popular_domains)``; the site's root URL itself earns two.
+        URLs outside every popular site score 0.
+        """
+        share = 100 / len(self.popular_domains)
+        # Start from zero on every call so repeated calls on the same
+        # instance always return the same value.
+        score = 0
+        for domain in self.popular_domains:
+            if domain == self.url:
+                score += share
+            # Prefix match: the site must be where the URL points, not just
+            # text that appears somewhere in its path or query string.
+            if self.url.startswith(domain):
+                score += share
+
+        self.ps = score
+        return score
diff --git a/Rahul/task4/Read.md b/Rahul/task4/Read.md
@@ -0,0 +1,20 @@
+In this project I have added three features:
+
+        1. Voice Search (fully implemented).
+        2. Login System, through which a user's data can be synced with their account, similar to Gmail and Google (not fully implemented yet; work in progress).
+        3. Home Key, which brings the user back to the home page from any other page.
+
+In this project I want to add some more features:
+
+        1. Voice Search Assistant: search using only the microphone, e.g. saying "Glugle Search Python" opens a new page with the results.
+        2. Syncing searches with the logged-in account, so that a user who changes devices can retrieve their searches and preferences.
+        3. PageRanking Algorithm
+        4. Settings Menu
+
+
+Voice Search --- I have made this one using the webkitSpeechRecognition API (Web Speech API), which converts voice into text.
+
+
+
+Gif of Glugle --- [Reference link](https://drive.google.com/file/d/14b2AJEh2mFW4VfxvtI9fsm1azd_iUT4y/view?usp=sharing)
+
diff --git a/Rahul/task4/Templates/base.html b/Rahul/task4/Templates/base.html
@@ -0,0 +1,37 @@
+<!doctype html>
+<html lang="en">
+
+<head>
+    <!-- Required meta tags -->
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+
+    <!-- Bootstrap CSS -->
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/css/bootstrap.min.css" rel="stylesheet"
+        integrity="sha384-F3w7mX95PdgyTmZZMECAngseQB83DfGTowi0iMjiWaeVhAn4FJkqJByhZMI3AhiU" crossorigin="anonymous">
+
+    <style>
+        .form-control:focus {
+            border-color: #43971b;
+            box-shadow: 0px 1px 1px rgba(0, 0, 0, 0.075) inset, 0px 0px 8px rgba(255, 100, 255, 0.5);
+        }
+
+        .dropdown {
+            float: left;
+        }
+    </style>
+
+    <title>Glugle</title>
+</head>
+
+<body style="background-color: rgb(224, 226, 226);">
+    {% block content %}{% endblock %}
+
+
+    <!-- Option 1: Bootstrap Bundle with Popper -->
+    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.1/dist/js/bootstrap.bundle.min.js"
+        integrity="sha384-/bQdsTh/da6pkI1MST/rWKFNjaCP5gBSY4sEBT38Q/9RBh9AH40zEOg7Hlq2THRZ"
+        crossorigin="anonymous"></script>
+</body>
+
+</html>
diff --git a/Rahul/task4/Templates/footer.html b/Rahul/task4/Templates/footer.html
@@ -0,0 +1,9 @@
+<div class="footer">
+    <h5 style="text-align:center">Welcome to Glugle search engine.</h5>
+        <style>
+            h5 {
+            color: maroon;
+            margin-top: 40px;
+        }
+        </style>
+ </div>
diff --git a/Rahul/task4/Templates/header.html b/Rahul/task4/Templates/header.html
@@ -0,0 +1,16 @@
+<!DOCTYPE html>
+<html>
+<head>
+  <link rel="stylesheet" href="main.css">
+</head>
+<body>
+  <div class="topnav">
+
+    <a class="active" href="/">Home</a>
+    <a href="/login">Login</a>
+    <a href="/signup">Sign Up</a>
+
+  </div>
+
+</body>
+</html>
diff --git a/Rahul/task4/Templates/home.html b/Rahul/task4/Templates/home.html
@@ -0,0 +1,58 @@
+{% extends 'base.html' %}
+
+
+    {% block content %}
+    {% include 'header.html' %}
+    {% if logged_in == False %}
+    {% include 'login_and_signup.html' %}
+    {% endif %}
+
+
+
+        <div class="col">
+
+
+            <div class="col mt-5">
+                <img src="{{url_for('static' , filename='./images/logo.png')}}" class="mx-auto d-block"
+                    style="width: 25%; height: 25%;">
+                <h1 style="color:rgb(91, 94, 92); text-align:center;font-family: 'Times New Roman', Times, serif;margin-top: 15px;padding: 15px;"
+                    class="mt-2"><b>Glugle Search</b></h1>
+            </div>
+
+
+            <form class="mt-5 container" name="search" style="width: 50%" action="/search_results">
+                <div class="col-6 mx-auto input-group">
+                    <input type="text" class="form-control" id="transcript" name="search" placeholder="search...">
+                    <img onclick="startDictation()" src="https://i.imgur.com/cHidSVu.gif" />
+                    <button type="submit" class="btn btn-success">Search</button>
+                </div>
+
+                <script>
+
+                    // Dictate a query into #transcript, then submit the form that owns it.
+                    function startDictation() {
+                      if (!('webkitSpeechRecognition' in window)) {
+                        return; // browser offers no speech recognition
+                      }
+                      var input = document.getElementById('transcript');
+                      var recognition = new webkitSpeechRecognition();
+                      recognition.continuous = false;
+                      recognition.interimResults = false;
+                      recognition.lang = "en-US";
+                      recognition.onresult = function (e) {
+                        input.value = e.results[0][0].transcript;
+                        recognition.stop();
+                        // The form has no id, so reach it through its input.
+                        input.form.submit();
+                      };
+                      recognition.onerror = function () {
+                        recognition.stop();
+                      };
+                      recognition.start();
+                    }
+
+                  </script>
+            </form>
+        </div>
+        {% include 'footer.html' %}
+{% endblock %}
diff --git a/Rahul/task4/Templates/login.html b/Rahul/task4/Templates/login.html
@@ -0,0 +1,46 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta http-equiv="X-UA-Compatible" content="IE=edge">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Login</title>
+</head>
+<body>
+    <div id="loginrobly" class="ribbon l-box-lrg pure-g">
+        {#        <div class="l-box-lrg is-center pure-u-1 pure-u-med-1-2 pure-u-lrg-2-5">#}
+        {#            <img class="pure-img-responsive" alt="File Icons" width="300" src="/static/img/common/file-icons.png">#}
+        {#        </div>#}
+                <div class="pure-u-1 pure-u-med-1-2 pure-u-lrg-3-5">
+
+                    <h2 class="content-head content-head-ribbon">Login.</h2>
+
+                    <form class="pure-form pure-form-stacked" action="/login" method="POST" accept-charset="utf-8">
+                            <fieldset>
+                                <label for="email">Your Email</label>
+                                <input id="email" name="email" type="email" required="true" placeholder="Your Email"><br>
+
+                                <label for="password">Your Password</label>
+                                <input id="password" name="password" type="password" required="true" placeholder="Your Password"><br>
+
+                                <button type="submit" class="pure-button">Login</button>
+                            </fieldset>
+                        </form>
+                </div>
+            </div>
+            <style>
+                body {
+                    background-color: aquamarine;
+                }
+                form{
+                    background-color: azure;
+                }
+                label{
+                    font-family: 'Times New Roman', Times, serif;
+                }
+                input{
+                    margin: 10px;
+                }
+            </style>
+</body>
+</html>