From aa15330c027e227a11a093969031379db1125f3c Mon Sep 17 00:00:00 2001 From: md manzer alam Date: Wed, 27 Oct 2021 09:59:45 +0530 Subject: [PATCH 1/3] finished task 3, website building --- .../__pycache__/website.cpython-39.pyc | Bin 0 -> 1434 bytes MdManzerAlam_Task3/crawler/Crawler.py | 84 ++++++++++ MdManzerAlam_Task3/main.py | 3 + MdManzerAlam_Task3/models/query.py | 33 ++++ MdManzerAlam_Task3/static/css/index.css | 81 ++++++++++ MdManzerAlam_Task3/static/css/results.css | 145 ++++++++++++++++++ MdManzerAlam_Task3/templates/index.html | 23 +++ MdManzerAlam_Task3/templates/result.html | 41 +++++ MdManzerAlam_Task3/website.py | 48 ++++++ 9 files changed, 458 insertions(+) create mode 100644 MdManzerAlam_Task3/__pycache__/website.cpython-39.pyc create mode 100644 MdManzerAlam_Task3/crawler/Crawler.py create mode 100644 MdManzerAlam_Task3/main.py create mode 100644 MdManzerAlam_Task3/models/query.py create mode 100644 MdManzerAlam_Task3/static/css/index.css create mode 100644 MdManzerAlam_Task3/static/css/results.css create mode 100644 MdManzerAlam_Task3/templates/index.html create mode 100644 MdManzerAlam_Task3/templates/result.html create mode 100644 MdManzerAlam_Task3/website.py diff --git a/MdManzerAlam_Task3/__pycache__/website.cpython-39.pyc b/MdManzerAlam_Task3/__pycache__/website.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ec2ee34d5ef6d7a0dafc495ff639640a145e82f5 GIT binary patch literal 1434 zcmY*ZPjA~c6c;H`l4Z$Bx_=n3!#3n1psg{iJro#*4hcG-Sb|~=dJr}Uj7i(FB~d2n zH8G@9y2B3n20J9jeV*NT+)n!n-64CFmo5{*=l4kZ{rU0tNRuRT;M@ECj6U!k=TB?g z{ai4f!c;8?<}g$`70OVBGhDfuTX~sRk&Ga_hfBW-vY^-9GOPyKpo+2xIUMof1DB`=+H^cQ5{q{NVCB~?qa zBYDlZ%r&o?l4@=O$yaNxv4PgSA$!{*$0)}_ru*{mEa6Ofz1SaP*4!QKCoAHkkI+b91AC^mV9FL39y z&H{C)MGM?v?Jm44!rW`rx!P~t3jlrQbRJC7;;Vppt!ppsS6m=C+1|zzD27Z^>2I`qvO^^;{2F)L%FL z!R-cJ)D3mik_*(ude{!ysEt{|Mwecf=$P%aq#eSDK5~A>5X0^#km2s757)AHYzC#6 z&~h%6E`BK-#}FWfJl~Lg&Cm3+6mNok#i^XkH>3RtRs1Ed0p8*#Zip^)$&CxdIWe9k zg$WyAnblI9je43iRMLuTF0*^LdvM@(EjkrKD=lgBDD~t$0A0QZ0$dAC%Pg5Fl~3*A z*Fr8g!|lCoDMT!HS8E_IyYt5gg5Gi!t+gY+#G3E~0-tWWeZ96q6Mj_*wl4YC2rOkW`UI-k#i1Lb$j;A0^ba0#b2R_} literal 0 HcmV?d00001 diff --git a/MdManzerAlam_Task3/crawler/Crawler.py b/MdManzerAlam_Task3/crawler/Crawler.py new file mode 100644 index 00000000..4ffe7b40 --- /dev/null +++ b/MdManzerAlam_Task3/crawler/Crawler.py @@ -0,0 +1,84 @@ +from urllib.parse import * +from urllib import robotparser as rb +from urllib.robotparser import RobotFileParser +import requests +from bs4 import BeautifulSoup +from pymongo import MongoClient +from models.query import Query +class Crawler: + def __init__(self) -> None: + self.client = MongoClient('localhost',27017) + pass + def start_crawl(self,url): + robo_file_parser = rb.RobotFileParser() + robo_file_parser.set_url(url+'/robots.txt') + robo_file_parser.read() + self.url = url + self.crawl(url=url,depth=2,rf_parser=robo_file_parser) + + def crawl(self,url,depth,rf_parser:RobotFileParser): + print(url+" | "+str(depth)) + + if depth == 0: + # todo parse single page + if rf_parser.can_fetch("*",url=url): + + try: + request = requests.get(url) + soup = BeautifulSoup(request.text,"lxml") + desc = "" + try: + for p_tag in soup.find_all("p"): + desc+=str(p_tag.text) + except: + desc = "empty" + title = "" + try: + title = soup.find('title').text + except: + title = "empty" + query = Query(title=title,url=url,description=desc) + query.save(self.client) + except: + pass + pass + else : + # todo parse multiple pages + if rf_parser.can_fetch("*",url=url): + try: + request = requests.get(url) + soup = BeautifulSoup(request.text,"lxml") + desc = "" + try: + for p_tag in soup.find_all("p"): + desc+=str(p_tag) + except: + desc = "empty" + title = "" + try: + title = soup.find('title').text + except: + title = "empty" + query = Query(title=title,url=url,description=desc) + query.save(self.client) + in_page_links_raw = soup.find_all("a") + in_page_links = [] + for a_tag in in_page_links_raw: + try: + param = str(a_tag["href"]) + except: + param = "" + continue + if param.startswith(" "): + param.removeprefix(" ") + if param.startswith("http"): + in_page_links.append(param) + elif param.startswith("//www"): + in_page_links.append("https://"+param.removeprefix("//")) + else: + in_page_links.append(self.url+param) + for link in in_page_links: + self.crawl(link,depth-1,rf_parser) + except: + pass + pass \ No newline at end of file diff --git a/MdManzerAlam_Task3/main.py b/MdManzerAlam_Task3/main.py new file mode 100644 index 00000000..aa70f251 --- /dev/null +++ b/MdManzerAlam_Task3/main.py @@ -0,0 +1,3 @@ +from crawler.Crawler import Crawler +obj = Crawler() +obj.start_crawl("https://www.rottentomatoes.com") \ No newline at end of file diff --git a/MdManzerAlam_Task3/models/query.py b/MdManzerAlam_Task3/models/query.py new file mode 100644 index 00000000..c5155ef1 --- /dev/null +++ b/MdManzerAlam_Task3/models/query.py @@ -0,0 +1,33 @@ +import json +from pymongo import MongoClient +from pymongo import TEXT +class Query: + def __init__(self,title,url,description) -> None: + self.title = title + self.url = url + self.description = description + + def __str__(self) -> str: + obj = { + 'url' : self.url, + 'title' : self.title, + 'description' : self.description, + } + return json.dumps(obj) + + def save(self,client): + obj = { + 'url' : self.url, + 'title' : self.title, + 'description' : self.description, + } + db = client.gluggle + collection = db.queries + collection.insert_one(obj) + collection.create_index([ + ('url', TEXT), + ('title', TEXT), + ('description', TEXT), + ], name='search_results', default_language='english') + client.close() + \ No newline at end of file diff --git a/MdManzerAlam_Task3/static/css/index.css b/MdManzerAlam_Task3/static/css/index.css new file mode 100644 index 00000000..5cd59da1 --- /dev/null +++ b/MdManzerAlam_Task3/static/css/index.css @@ -0,0 +1,81 @@ +@import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@300;400;500;600;700&family=Righteous&display=swap'); +*{ + font-family: 'Comfortaa',sans-serif; + border: none; + outline: none; + text-decoration: none; + list-style: none; + margin: 0px; + padding: 0px; +} +html{ + font-size: 16px; +} +h1{ + font-size: 3rem; + font-weight: 400; + letter-spacing: 0.2rem; + font-family: 'Righteous', cursive; +} +h3{ + font-size: 1.5rem; + font-weight: 400; +} +@media screen and (max-width: 500px) { + html{ + font-size: 14px; + } +} +body{ + background-color: rgb(32, 32, 32); + color: hsl(0, 0%, 98%); +} +header{ + position: fixed; + top: 50%; + left: 50%; + -webkit-transform: translate(-50%, -50%); + transform: translate(-50%, -50%); + display: flex; + justify-content: center; + flex-direction: column; + align-items: center; + text-align: center; + width: 80%; + /* max-width: 1000px; */ + gap: 2em; +} +header form{ + width: 100%; +} +header fieldset{ + display: flex; + gap: 1em; + width: 100%; + max-width: 900px; + margin: 0px auto; +} +input,button{ + width: 80%; + font-size: 1.25rem; + padding: 1em 1em; + border-radius: 100px; + color: hsl(0, 0%, 98%); + background-color: hsl(0, 0%, 4%); + margin: 0.5em auto; +} +#search-btn{ + background-color: #4fa; + font-family: 'Righteous', cursive; + color: black; + font-weight: 700; + padding: 0.5em 1em; + font-weight: 400; + letter-spacing: 0.2rem; + width: auto; +} +@media screen and (max-width: 500px) { + fieldset{ + flex-direction: column; + } +} \ No newline at end of file diff --git a/MdManzerAlam_Task3/static/css/results.css b/MdManzerAlam_Task3/static/css/results.css new file mode 100644 index 00000000..76ef0990 --- /dev/null +++ b/MdManzerAlam_Task3/static/css/results.css @@ -0,0 +1,145 @@ +@import url('https://fonts.googleapis.com/css2?family=Comfortaa:wght@300;400;500;600;700&family=Righteous&display=swap'); +*{ + font-family: 'Comfortaa',sans-serif; + border: none; + outline: none; + text-decoration: none; + list-style: none; + margin: 0px; + padding: 0px; + text-decoration: none; + color: inherit; +} +html{ + font-size: 16px; +} +h1{ + font-size: 3rem; + font-weight: 400; + margin: auto auto; + letter-spacing: 0.2rem; + font-family: 'Righteous', cursive; +} +h3{ + font-size: 1.5rem; + font-weight: 400; +} +@media screen and (max-width: 500px) { + html{ + font-size: 14px; + } +} +body{ + background-color: rgb(32, 32, 32); + color: hsl(0, 0%, 98%); +} +header{ + position: sticky; + margin: 1rem auto; + display: flex; + justify-content: center; + flex-direction: column; + align-items: center; + text-align: center; + width: 80%; + gap: 2em; +} +header form{ + width: 100%; + display: flex; + justify-content: center; + gap: 1em; +} +header fieldset{ + display: flex; + gap: 1em; + width: 100%; + max-width: 1000px; + margin: 0px auto; + text-align: center; +} +input,button{ + width: 80%; + font-size: 1.25rem; + padding: 1em 1em; + border-radius: 100px; + color: hsl(0, 0%, 98%); + background-color: hsl(0, 0%, 4%); + margin: 0.5em auto; +} +#search-btn{ + background-color: #4fa; + font-family: 'Righteous', cursive; + color: black; + font-weight: 700; + padding: 0.5em 1em; + font-weight: 400; + letter-spacing: 0.2rem; + width: auto; +} +main{ + padding: 0.5em 10vw; +} +.results{ + display: flex; + flex-direction: column; + justify-content: center; + gap: 1em; + margin-top: 1em; +} +.results .card{ + border-radius: 1em; + padding: 0.75em 1em; + background-color: hsl(0, 0%, 20%); + display: flex; + flex-direction: column; + gap: 0.5rem; +} +.card a{ + color: #4fa; +} +@media screen and (max-width: 500px) { + header form{ + flex-direction: column; + } + header form h1{ + font-size: 2rem; + } + header fieldset{ + flex-direction: row; + gap: 0.5em; + } + header{ + width: 90%; + } + main{ + padding: 0.5em 5vw; + } +} +.pagination{ + margin: 1em auto; + display: flex; + justify-content: center; + gap: 2em; +} +.pagination li{ + margin: auto 0px; +} +.pagination span,li{ + background-color: hsl(0, 0%, 20%); + padding: 0.5em; + border-radius: 0.5rem; +} +.pagination li.active{ + background-color: hsl(0, 0%, 50%); + padding: 0.5em; + border-radius: 0.5rem; +} +.pagination li.active span{ + background-color: hsl(0, 0%, 50%); + padding: 0.5em; + border-radius: 0.5rem; +} +.disabled{ + display: none; +} \ No newline at end of file diff --git a/MdManzerAlam_Task3/templates/index.html b/MdManzerAlam_Task3/templates/index.html new file mode 100644 index 00000000..397c506f --- /dev/null +++ b/MdManzerAlam_Task3/templates/index.html @@ -0,0 +1,23 @@ + + + + + + + Gluggle + + + +
+

Gluggle

+

Website Crawler made with Python and ❤️

+
+
+ + + +
+
+
+ + \ No newline at end of file diff --git a/MdManzerAlam_Task3/templates/result.html b/MdManzerAlam_Task3/templates/result.html new file mode 100644 index 00000000..334cffcf --- /dev/null +++ b/MdManzerAlam_Task3/templates/result.html @@ -0,0 +1,41 @@ + + + + + + + Gluggle + + + +
+
+

Gluggle

+
+ + + +
+
+
+
+

Results

+
    + {% if required %} + {% for link in required %} + +

    {{link.title}}

    + {{link.url}} +

    {{link.description[:250]}}

    +
    + {% endfor %} + + {% else %} +
    No results found
    + {% endif %} +
+
+ + \ No newline at end of file diff --git a/MdManzerAlam_Task3/website.py b/MdManzerAlam_Task3/website.py new file mode 100644 index 00000000..a32edaf9 --- /dev/null +++ b/MdManzerAlam_Task3/website.py @@ -0,0 +1,48 @@ +from flask import Flask,render_template,request,redirect,url_for +from pymongo import MongoClient +from flask_paginate import Pagination, get_page_args + +app = Flask(__name__) +# app.debug = True + +@app.route("/") +def home(): + return render_template("index.html") + +@app.route("/results/") +def result(): + client = MongoClient('localhost',27017) + database = client.gluggle + collection = database.queries + search_string = request.args.get('query') + print(search_string) + search_result = [] + required = [] + search_result = collection.find( + {"$text": {"$search": search_string, '$caseSensitive': False}}) + for object in search_result: + exist = False + for result in required: + if result['title'] == object['title'] or result['url'] == object['url']: + exist = True + + break + + if exist == False: + required.append(object) + page, per_page, offset = get_page_args(page_parameter='page', + per_page_parameter='per_page') + total = len(required) + + pagination = Pagination(page=page, per_page=per_page, total=total, + css_framework='bootstrap4') + return render_template('result.html', + required=required[offset:offset+per_page], + page=page, + per_page=per_page, + pagination=pagination, + q=search_string + ) + +if __name__ == "__main__": + app.run(debug=True) \ No newline at end of file From e7c9c74c93b93a6843454f1afa3bee4ac1a87c3c Mon Sep 17 00:00:00 2001 From: md manzer alam Date: Wed, 27 Oct 2021 10:08:13 +0530 Subject: [PATCH 2/3] fixed overflow --- MdManzerAlam_Task3/templates/result.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MdManzerAlam_Task3/templates/result.html b/MdManzerAlam_Task3/templates/result.html index 334cffcf..b8a7c5d2 100644 --- a/MdManzerAlam_Task3/templates/result.html +++ b/MdManzerAlam_Task3/templates/result.html @@ -25,7 +25,7 @@

Results

{% for link in required %}

{{link.title}}

- {{link.url}} + {{link.url}}

{{link.description[:250]}}

{% endfor %} From 4ffc6e19ac904605149ea33f2b83c5499d1effb3 Mon Sep 17 00:00:00 2001 From: md manzer alam Date: Thu, 28 Oct 2021 23:48:52 +0530 Subject: [PATCH 3/3] finished task 4 --- .../__pycache__/website.cpython-39.pyc | Bin 1434 -> 0 bytes MdManzerAlam_Task3/website.py | 48 -------- .../__pycache__/website.cpython-39.pyc | Bin 0 -> 3027 bytes .../crawler/Crawler.py | 0 MdManzerAlam_Task4/hello.py | 2 + .../main.py | 0 .../models/query.py | 0 .../static/css/index.css | 0 .../static/css/results.css | 0 .../templates/index.html | 0 .../templates/result.html | 0 MdManzerAlam_Task4/website.py | 111 ++++++++++++++++++ 12 files changed, 113 insertions(+), 48 deletions(-) delete mode 100644 MdManzerAlam_Task3/__pycache__/website.cpython-39.pyc delete mode 100644 MdManzerAlam_Task3/website.py create mode 100644 MdManzerAlam_Task4/__pycache__/website.cpython-39.pyc rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/crawler/Crawler.py (100%) create mode 100644 MdManzerAlam_Task4/hello.py rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/main.py (100%) rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/models/query.py (100%) rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/static/css/index.css (100%) rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/static/css/results.css (100%) rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/templates/index.html (100%) rename {MdManzerAlam_Task3 => MdManzerAlam_Task4}/templates/result.html (100%) create mode 100644 MdManzerAlam_Task4/website.py diff --git a/MdManzerAlam_Task3/__pycache__/website.cpython-39.pyc b/MdManzerAlam_Task3/__pycache__/website.cpython-39.pyc deleted file mode 100644 index ec2ee34d5ef6d7a0dafc495ff639640a145e82f5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1434 zcmY*ZPjA~c6c;H`l4Z$Bx_=n3!#3n1psg{iJro#*4hcG-Sb|~=dJr}Uj7i(FB~d2n zH8G@9y2B3n20J9jeV*NT+)n!n-64CFmo5{*=l4kZ{rU0tNRuRT;M@ECj6U!k=TB?g z{ai4f!c;8?<}g$`70OVBGhDfuTX~sRk&Ga_hfBW-vY^-9GOPyKpo+2xIUMof1DB`=+H^cQ5{q{NVCB~?qa zBYDlZ%r&o?l4@=O$yaNxv4PgSA$!{*$0)}_ru*{mEa6Ofz1SaP*4!QKCoAHkkI+b91AC^mV9FL39y z&H{C)MGM?v?Jm44!rW`rx!P~t3jlrQbRJC7;;Vppt!ppsS6m=C+1|zzD27Z^>2I`qvO^^;{2F)L%FL z!R-cJ)D3mik_*(ude{!ysEt{|Mwecf=$P%aq#eSDK5~A>5X0^#km2s757)AHYzC#6 z&~h%6E`BK-#}FWfJl~Lg&Cm3+6mNok#i^XkH>3RtRs1Ed0p8*#Zip^)$&CxdIWe9k zg$WyAnblI9je43iRMLuTF0*^LdvM@(EjkrKD=lgBDD~t$0A0QZ0$dAC%Pg5Fl~3*A z*Fr8g!|lCoDMT!HS8E_IyYt5gg5Gi!t+gY+#G3E~0-tWWeZ96q6Mj_*wl4YC2rOkW`UI-k#i1Lb$j;A0^ba0#b2R_} diff --git a/MdManzerAlam_Task3/website.py b/MdManzerAlam_Task3/website.py deleted file mode 100644 index a32edaf9..00000000 --- a/MdManzerAlam_Task3/website.py +++ /dev/null @@ -1,48 +0,0 @@ -from flask import Flask,render_template,request,redirect,url_for -from pymongo import MongoClient -from flask_paginate import Pagination, get_page_args - -app = Flask(__name__) -# app.debug = True - -@app.route("/") -def home(): - return render_template("index.html") - -@app.route("/results/") -def result(): - client = MongoClient('localhost',27017) - database = client.gluggle - collection = database.queries - search_string = request.args.get('query') - print(search_string) - search_result = [] - required = [] - search_result = collection.find( - {"$text": {"$search": search_string, '$caseSensitive': False}}) - for object in search_result: - exist = False - for result in required: - if result['title'] == object['title'] or result['url'] == object['url']: - exist = True - - break - - if exist == False: - required.append(object) - page, per_page, offset = get_page_args(page_parameter='page', - per_page_parameter='per_page') - total = len(required) - - pagination = Pagination(page=page, per_page=per_page, total=total, - css_framework='bootstrap4') - return render_template('result.html', - required=required[offset:offset+per_page], - page=page, - per_page=per_page, - pagination=pagination, - q=search_string - ) - -if __name__ == "__main__": - app.run(debug=True) \ No newline at end of file diff --git a/MdManzerAlam_Task4/__pycache__/website.cpython-39.pyc b/MdManzerAlam_Task4/__pycache__/website.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..286c290d63304fcf0aac27918a34c19cee8e3e17 GIT binary patch literal 3027 zcmZWrTaO$$6)w9jU0r>d&SWy#kX;cVNVLRG?}|WJ6a`QwyFwyE#LQDEtEQ&y>8a_u z^tjx)w5p|*5(GiY1Mf)8NWAO|Kgdr=@RH}{g+*U?IYvnLo$$N3n%FXObz8bHdtaCNK zYS&qd*CdnIUb1*yt{+_&(J>P((SGHi=RMI8-B(V0L#&7%zTdJMSD>*fu3C+o@|swi zaP<$le(=7yCf47O4c@lXzy1~{b^5;XKh2VAVpfzagj7Q<^J$i7X&Oo%&Sa%csH8}h z9BI>-scg7csy;KVM`bZCA7rU4bf25(aWYPeM5kq8I%BDa(_}1%i5gd1rdjE7`m9u< ziv8)VnCL!J6b-D>q3)qBC$dQAk~$ujN=vm3D{`s6W`hw~dGXJR2q?b$#5FJ zHIGu*|9oR#=NUGjfPU$yEa z%F$qFvwZO3XW49=$-yJ>C@JPr{W44P;ST)tqro$|TcuiVOkWs}mh6!+WR=4e$sZZoh0Li6RJC*D96Jj~vm-8;VA?(Kj@<=AzhARshLJ!wjzYnYakAG;H)}+C!T-Sa zm~e&nDy;b?^|(5E>AZH%#vP3N7skDsq8w})$24Ry&eCfCl4WGEhjNuHdk@tZwte1% zCG3yRQ>MI{{h1$PjdLG<$qnE5$aoYq#44h=4sBqUIxASkME9_j9+mm@{#ED-@^SGJaM*lghZT%{235BDG42$^=VWO$(qM>Ddz8 zO*>{Uh+9FCfNcDw6Izf}o7VJA=hAuO=yY1C;?_Cnt9X4c%>Z~QhL1k~ z&gTaUUNd}uw{UB>23T)6PaU-ZiTlnCN4R%^>U&PnJL~b5JyzSd+E32Lg5}s@`>f|g z`>=iJ;Tktyf|xX}fe1zOl_Mh2dgB7X?JeW$RAeREg7!pikX&>)kvj4QdzvE zp84wLZ?+!Z`p&HjpG!C*C7QpDV)+Bx;aHYG6d_{iZ)O4|t68R(n6VsU{T|)}4eW~K z{vFc2foguU0+|efm&rvGFST|+jP69fAS!(-RVC5(r@+4kwut9M-y?QCoPl12>6}~& zU;*2!KJ9Vs)Tnd#f=^Gk`~gNQ3b=#FhOEt2dC%5sXsM6UKW`3}N2x9|DG?PBs15l# z{6b7B_)ES(@DCWVzyV)yVy7ctxZovM2ek{{@pOZ}40mwhqXabvTY0q$_6h#NQzjY| z!jz@#m>)F}NZZ=uqNSsQ)*c^&*%pnu0cibt5ek3NtOLkDTtsyw+Bcj!qpieOYqaFDsqCPU*5E zE2POERY{k1e_2By9LN zxX7wI#-ogB!YNWPNqru7Z4R9#D!~d;#jB@dP&n<2cFR(Or;_QNzNhYDIrURo)7Pa= zvbZyZ@Svkdte_gsCKG(SQWpEB&pRi(qk-4kLw!Mq8|T}tf6 zm(P({mRIWhz(KjUM;!hMt@$&m&VCmx&szPTpuc*8%Ha_U@V4z;^B7*v8QyQacf7$F z>UaDd>HV+W$%MmUo}|TaxP!dq z3%NTR_dhWH9{ryw%X{=2Vg0Xsl8Cg`&#AJZ>RL^L4~0=Ky-IG5y&}^Sc(i<0WMv}a z7PU8k#_6oGDJ<^Tp3^@uMX4LfR-n7)vUPS+>4BwmtvO5`$WwZ@!9)uaL>9pLfUHEm&h%2k;dF`E$rfZH)Q_< D^5*M( literal 0 HcmV?d00001 diff --git a/MdManzerAlam_Task3/crawler/Crawler.py b/MdManzerAlam_Task4/crawler/Crawler.py similarity index 100% rename from MdManzerAlam_Task3/crawler/Crawler.py rename to MdManzerAlam_Task4/crawler/Crawler.py diff --git a/MdManzerAlam_Task4/hello.py b/MdManzerAlam_Task4/hello.py new file mode 100644 index 00000000..3ff3043e --- /dev/null +++ b/MdManzerAlam_Task4/hello.py @@ -0,0 +1,2 @@ +import nltk +nltk.download() \ No newline at end of file diff --git a/MdManzerAlam_Task3/main.py b/MdManzerAlam_Task4/main.py similarity index 100% rename from MdManzerAlam_Task3/main.py rename to MdManzerAlam_Task4/main.py diff --git a/MdManzerAlam_Task3/models/query.py b/MdManzerAlam_Task4/models/query.py similarity index 100% rename from MdManzerAlam_Task3/models/query.py rename to MdManzerAlam_Task4/models/query.py diff --git a/MdManzerAlam_Task3/static/css/index.css b/MdManzerAlam_Task4/static/css/index.css similarity index 100% rename from MdManzerAlam_Task3/static/css/index.css rename to MdManzerAlam_Task4/static/css/index.css diff --git a/MdManzerAlam_Task3/static/css/results.css b/MdManzerAlam_Task4/static/css/results.css similarity index 100% rename from MdManzerAlam_Task3/static/css/results.css rename to MdManzerAlam_Task4/static/css/results.css diff --git a/MdManzerAlam_Task3/templates/index.html b/MdManzerAlam_Task4/templates/index.html similarity index 100% rename from MdManzerAlam_Task3/templates/index.html rename to MdManzerAlam_Task4/templates/index.html diff --git a/MdManzerAlam_Task3/templates/result.html b/MdManzerAlam_Task4/templates/result.html similarity index 100% rename from MdManzerAlam_Task3/templates/result.html rename to MdManzerAlam_Task4/templates/result.html diff --git a/MdManzerAlam_Task4/website.py b/MdManzerAlam_Task4/website.py new file mode 100644 index 00000000..e967139a --- /dev/null +++ b/MdManzerAlam_Task4/website.py @@ -0,0 +1,111 @@ +from flask import Flask,render_template,request,redirect,url_for +from pymongo import MongoClient +from flask_paginate import Pagination, get_page_args +import nltk +import pymongo +nltk.download('stopwords') +nltk.download('punkt') +from nltk.corpus import stopwords +from nltk.tokenize import word_tokenize +from nltk.stem.porter import PorterStemmer +import os +import string +app = Flask(__name__) +# app.debug = True + +@app.route("/") +def home(): + return render_template("index.html") + +def search_string_optimizations(search_string): + # lowercasing the serch_results + search_string = search_string.lower() + + # remove punctuations from the search_results + translator = str.maketrans('', '', string.punctuation) + search_string = search_string.translate(translator) + + # removing stopwords and tokenization from the search_results + stop_words = set(stopwords.words("english")) + word_tokens = word_tokenize(search_string) + filtered_search_string = [ + word for word in word_tokens if word not in stop_words] + + # performing stemming in the search_results + stemmer = PorterStemmer() + word_tokens = word_tokenize(search_string) + stems = [stemmer.stem(word) for word in word_tokens] + return stems + +def sort_rank(required, optimized_res): + for result in required: + for word in optimized_res: + if word in result['title']: + result['score'] += 2 + else: + result['score'] += 0 + if word in result['description']: + result['score'] += 1 + else: + result['score'] += 0 + print('DONE ! DONE ! DONE') + return sorted(required, key=lambda result: result['score'], reverse=True) + +@app.route("/results/") +def result(): + client = MongoClient('localhost',27017) + database = client.gluggle + collection = database.queries + search_string = request.args.get('query') + optimized_res = search_string_optimizations(search_string) + print(search_string) + search_result = [] + required = [] + search_results = collection.find( + { + "$text": { + "$search": search_string, + '$caseSensitive': False + } + }, + { + "score": { + '$meta' : "textScore" + } + }). sort( + [ + ('sort', {'$meta': 'textScore'}), + ('_id', pymongo.DESCENDING) + ] + ) + + for object in search_results: + exist = False + for result in required: + if result['title'] == object['title'] or result['url'] == object['url']: + exist = True + break + + if exist == False: + # print(dir(object)) + required.append(object) + # print(required) + + required = sort_rank(required, optimized_res) + + page, per_page, offset = get_page_args( + page_parameter='page', per_page_parameter='per_page') + total = search_results.count() + + pagination = Pagination(page=page, per_page=per_page, total=total, + css_framework='bootstrap4') + return render_template('result.html', + required=required[offset:offset+per_page], + page=page, + per_page=per_page, + pagination=pagination, + q=search_string + ) + +if __name__ == "__main__": + app.run(debug=True) \ No newline at end of file