Simpola-task4-Ranking #60

Open: wants to merge 12 commits into base: master.
13 changes: 13 additions & 0 deletions Simpola-Task4/README.md
@@ -0,0 +1,13 @@
# Flearch

<hr/>

## About the project:

Flearch is a fully functional search engine written in Python. It returns results ranked against a textual search query. MongoDB is used as the database, and several Python libraries, including BeautifulSoup, PyMongo, and Requests, are utilized; the web application is built with Flask. A brief setup sketch follows the tech stack list below.

## Tech stack used:

- Python programming language
- MongoDB
- Flask
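
## Running the project (sketch):

The Flask app relies on NLTK data (the stopword list and the Punkt tokenizer) that is not installed automatically. The snippet below is a minimal setup sketch, added here for illustration rather than prescribed by the project; newer NLTK releases may ask for `punkt_tab` instead of `punkt`.

```python
# One-time NLTK setup assumed by app.py (stopwords + word_tokenize).
import nltk

nltk.download("stopwords")  # stopword list used by search_string_optimizations
nltk.download("punkt")      # tokenizer model behind word_tokenize
```

With MongoDB running locally on port 27017, the crawler is started with `python crawler.py <seed_url> <depth>` and the web app with `python app.py`.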
Binary file added Simpola-Task4/__pycache__/app.cpython-38.pyc
Binary file not shown.
119 changes: 119 additions & 0 deletions Simpola-Task4/app.py
@@ -0,0 +1,119 @@
from flask import Flask, render_template, url_for, request
import pymongo
from flask_paginate import Pagination, get_page_args
import os
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


app = Flask(__name__)


@app.route("/")
def hello_name():
return render_template('home.html')


def search_string_optimizations(search_string):
    # lowercase the search string
    search_string = search_string.lower()

    # remove punctuation from the search string
    translator = str.maketrans('', '', string.punctuation)
    search_string = search_string.translate(translator)

    # tokenize the search string and remove stopwords
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(search_string)
    filtered_search_string = [
        word for word in word_tokens if word not in stop_words]

    # stem the remaining tokens
    stemmer = PorterStemmer()
    stems = [stemmer.stem(word) for word in filtered_search_string]
    return stems
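
# Illustrative example (not part of the original code):
# search_string_optimizations("Running the Crawlers!") lowercases the text,
# strips the punctuation and the stopword "the", stems what remains, and
# returns ['run', 'crawler'].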


def sort_rank(required, optimized_res):
    # boost results whose title or description contains a query stem:
    # +2 for a title hit, +1 for a description hit
    for result in required:
        for word in optimized_res:
            if word in result['title'].lower():
                result['score'] += 2
            if word in result['description'].lower():
                result['score'] += 1
    return sorted(required, key=lambda result: result['score'], reverse=True)
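
# Illustration with hypothetical data (not part of the original code): for
# optimized_res = ['crawler'], a result whose title contains "crawler" gains
# +2, and +1 more if its description contains it, on top of the MongoDB
# text score it already carries.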


@app.route("/search_results")
def fun():
client = pymongo.MongoClient("mongodb://127.0.0.1:27017", connect=False)
db = client.glugledb
search_string = request.args.get('search')
search_results = []
required = []

# optimized
optimized_res = search_string_optimizations(search_string)

search_results = db.results.find(
{
"$text": {
"$search": search_string,
'$caseSensitive': False
}
},
{
"score": {
'$meta' : "textScore"
}
}). sort(
[
('sort', {'$meta': 'textScore'}),
('_id', pymongo.DESCENDING)
]
)

for object in search_results:
exist = False
for result in required:
if result['title'] == object['title'] or result['url'] == object['url']:
exist = True
break

if exist == False:
# print(dir(object))
required.append(object)
# print(required)

# Applying the ranking mechanism
required = sort_rank(required, optimized_res)

page, per_page, offset = get_page_args(
page_parameter='page', per_page_parameter='per_page')

total = search_results.count()
# return render_template('searches.html', required=required)

pagination = Pagination(page=page, per_page=per_page,
total=total, css_framework='bootstrap4')

return render_template('searches.html',
required=required[offset:offset+per_page],
page=page,
per_page=per_page,
pagination=pagination,
total=total,
search_string=search_string
)


if __name__ == '__main__':
app.run(debug=True)
100 changes: 100 additions & 0 deletions Simpola-Task4/crawler.py
@@ -0,0 +1,100 @@
import requests
from bs4 import BeautifulSoup
import urllib.parse
import urllib.request
import pymongo
import lxml
import sys


class Crawler:
    client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
    db = client.glugledb
    collection = db.info

    disallowed_link = []

    def start_crawl(self, url, depth):
        robots = urllib.parse.urljoin(url, '/robots.txt')

        try:
            robot = requests.get(robots)
        except requests.exceptions.RequestException:
            print("Robots not found here!")
            self.crawl(url, depth)
            return

        # collect every path-like token from robots.txt as a disallowed link
        for word in robot.text.split():
            if word[0] == '/':
                self.disallowed_link.append(urllib.parse.urljoin(url, word))
        print("Robots found and appended in disallowed_links...")

        self.crawl(url, depth, self.disallowed_link)

    def crawl(self, url, depth, disallowed_link=()):
        try:
            print(f"Crawling url {url} at depth: {depth}")
            response = requests.get(url)
        except requests.exceptions.RequestException:
            print(f"Failed to perform HTTP GET request on {url}")
            return
        soup = BeautifulSoup(response.text, 'lxml')

        try:
            title = soup.find('title').text
            desc = ''

            # concatenate all paragraph text as the page description
            for tag in soup.findAll():
                if tag.name == 'p':
                    desc = desc + tag.text.strip().replace('\n', '')

        except AttributeError:
            print("Failed to retrieve title and desc...")
            return

        query = {
            'url': url,
            'title': title,
            'description': desc,
            'score': 0,
        }

        results = self.db.results
        results.insert_one(query)
        # create_index is idempotent, so calling it for every page is safe
        results.create_index(
            [
                ('url', pymongo.TEXT),
                ('title', pymongo.TEXT),
                ('description', pymongo.TEXT),
                ('score', 1)
            ],
            name='results',
            default_language='english'
        )

        if depth == 0:
            return

        links = soup.findAll('a')

        for link in links:
            try:
                if link['href'] not in disallowed_link:
                    if 'http' in link['href']:
                        self.crawl(link['href'], depth - 1, disallowed_link)
                    else:
                        link['href'] = urllib.parse.urljoin(url, link['href'])
                        self.crawl(link['href'], depth - 1, disallowed_link)
            except KeyError:
                print("No links retrieved from the page")


# Usage: python crawler.py <seed_url> <depth>
spider = Crawler()
spider.start_crawl(
    sys.argv[1], int(sys.argv[2])
)
spider.client.close()
38 changes: 38 additions & 0 deletions Simpola-Task4/requirements.txt
@@ -0,0 +1,38 @@
appdirs==1.4.3
asgiref==3.2.3
beautifulsoup4==4.7.1
cachelib==0.4.1
certifi==2018.11.29
chardet==3.0.4
charset-normalizer==2.0.7
click==8.0.3
defusedxml==0.5.0
distlib==0.3.0
Django==3.0.2
django-allauth==0.38.0
django-braces==1.13.0
django-crispy-forms==1.8.1
django-datetimepicker==3.14
dnspython==2.1.0
Flask==2.0.2
flask-paginate==0.8.1
Flask-Session==0.4.0
gunicorn==20.1.0
idna==2.8
itsdangerous==2.0.1
Jinja2==3.0.2
lxml==4.6.3
Markdown==3.2.2
MarkupSafe==2.0.1
numpy==1.21.3
oauthlib==3.0.0
Pillow==8.4.0
pymongo==3.12.1
python3-openid==3.1.0
pytz==2018.9
requests==2.21.0
requests-oauthlib==1.2.0
soupsieve==1.8
sqlparse==0.3.0
urllib3==1.24.2
Werkzeug==2.0.2