Merge pull request #1487 from Sefaria/trellojob
Trellojob
akiva10b authored Jul 4, 2023
2 parents 21c9277 + 8dfddf4 commit 71b5713
Showing 2 changed files with 31 additions and 68 deletions.
75 changes: 17 additions & 58 deletions scripts/webpages_cronjob.py
@@ -22,67 +22,25 @@ def run_job(test=True, board_id="", idList_mapping={}):
webpages_without_websites_days = sites_that_may_have_removed_linker_days # same timeline is relevant

print("Original webpage stats...")
orig_total_pages, orig_total_links, year_data = webpages_stats()

orig_count = WebPageSet().count()
skip = 0
limit = 500
print("Cleaning webpages...")
clean_webpages(test=test)
dedupe_webpages(test=test)
total_pages, total_links, year_data = webpages_stats()

post_object = {
"username": "Webpage Cronjob",
"channel": "#engineering-signal",
"blocks": [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "Webpage Cronjob",
"emoji": True
}
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": f"*Original webpage total:*\n{orig_total_pages}"
},
{
"type": "mrkdwn",
"text": f"*Webpage total after running cronjob:*\n{total_pages}"
}
]
},
{
"type": "section",
"fields": [
{
"type": "mrkdwn",
"text": f"*Original link total:*\n{orig_total_links}"
},
{
"type": "mrkdwn",
"text": f"*Link total after running cronjob:*\n{total_links}"
}
]
},
]
}
requests.post(SLACK_URL, json=post_object)
print("Find sites that no longer have linker...")
sites["Linker uninstalled"] = find_sites_that_may_have_removed_linker(last_linker_activity_day=sites_that_may_have_removed_linker_days)
print("Looking for webpages that have no corresponding website. If WebPages have been accessed in last 20 days, create a new WebSite for them. Otherwise, delete them.")
sites["Site uses linker but is not whitelisted"] = find_webpages_without_websites(test=test, hit_threshold=50, last_linker_activity_day=webpages_without_websites_days)

#
# flag = 500
# print("Looking for websites where the same Ref appears in at least {} pages...".format(flag))
# sites["Websites that may need exclusions set"] = find_sites_to_be_excluded_relative(relative_percent=3)
while (skip + limit) < orig_count:
webpages = WebPageSet(limit=limit, skip=skip)
print("Deduping...")
dedupe_webpages(webpages, test=test)
print("Looking for webpages that have no corresponding website. If WebPages have been accessed in last 20 days, create a new WebSite for them. Otherwise, delete them.")
sites["Site uses linker but is not whitelisted"] = find_webpages_without_websites(webpages, test=test, hit_threshold=50, last_linker_activity_day=webpages_without_websites_days)
sites["Websites that may need exclusions set"] = find_sites_to_be_excluded_relative(webpages, relative_percent=3)
skip += limit

print(f"Removed {WebPageSet().count() - orig_count} pages")

# given list type and site, either create new card or update existing card with message of site object
print("****")
print(sites)
for kind, sites_to_handle in sites.items():
print(f"{kind} -> {sites_to_handle}")
@@ -94,11 +52,12 @@ def run_job(test=True, board_id="", idList_mapping={}):
site_name_on_trello = site_on_trello['name']
if site_name_in_DB == site_name_on_trello:
already_on_trello = True
board.add_comment(site_on_trello, comment)
if not test:
board.add_comment(site_on_trello, comment)
break
if not already_on_trello:
card = board.create_card(site_name_in_DB, idList_mapping[kind])
board.add_comment(card, comment)
if not already_on_trello and not test:
card = board.create_card(site_name_in_DB, idList_mapping[kind])
board.add_comment(card, comment)


class TrelloBoard:
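Note on the new batching approach: the reworked run_job no longer loads the full WebPageSet at once; it walks the collection in fixed-size batches of 500 and hands each batch to dedupe_webpages, find_webpages_without_websites, and find_sites_to_be_excluded_relative. A minimal sketch of that pagination pattern, assuming a record-set class that accepts limit and skip keyword arguments as WebPageSet does above (iterate_in_batches and process_batch are illustrative names, not part of this commit):

def iterate_in_batches(record_set_class, process_batch, batch_size=500):
    # Take the record count once up front, as the cronjob does with orig_count.
    total = record_set_class().count()
    skip = 0
    while skip < total:
        batch = record_set_class(limit=batch_size, skip=skip)
        process_batch(batch)  # e.g. dedupe_webpages(batch, test=True)
        skip += batch_size

Batching keeps memory bounded when the webpages collection is large, at the cost of issuing one query per batch.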
24 changes: 14 additions & 10 deletions sefaria/model/webpage.py
@@ -312,6 +312,12 @@ def __eq__(self, other):
return self.__key() == other.__key()
return NotImplemented

def get_num_webpages(self):
if getattr(self, 'num_webpages', None) is None:
self.num_webpages = WebPageSet({"url": {"$regex": "|".join(self.domains)}}).count()
self.save()
return self.num_webpages


class WebSiteSet(abst.AbstractMongoSet):
recordClass = WebSite
@@ -373,11 +379,10 @@ def test_normalization():
print("{} pages normalized".format(count))


def dedupe_webpages(test=True):
def dedupe_webpages(webpages, test=True):
"""Normalizes URLs of all webpages and deletes multiple entries that normalize to the same URL"""
norm_count = 0
dedupe_count = 0
webpages = WebPageSet()
for i, webpage in tqdm(enumerate(webpages)):
norm = WebPage.normalize_url(webpage.url)
if webpage.url != norm:
@@ -429,7 +434,7 @@ def dedupe_identical_urls(test=True):
"count": -1
}
}
], allowDiskUse=True);
], allowDiskUse=True)

url_count = 0
removed_count = 0
@@ -523,9 +528,8 @@ def webpages_stats():
return (total_pages, total_links, year_data)


def find_webpages_without_websites(test=True, hit_threshold=50, last_linker_activity_day=20):
def find_webpages_without_websites(webpages, test=True, hit_threshold=50, last_linker_activity_day=20):
from datetime import datetime, timedelta
webpages = WebPageSet()
new_active_sites = Counter() # WebSites we don't yet have in DB, but we have corresponding WebPages accessed recently
unactive_unacknowledged_sites = {} # WebSites we don't yet have in DB, and we have corresponding WebPages but they have not been accessed recently

@@ -560,10 +564,10 @@ def find_webpages_without_websites(test=True, hit_threshold=50, last_linker_activity_day=20):

return sites_added

def find_sites_to_be_excluded():
def find_sites_to_be_excluded(webpages):
# returns all sites dictionary and each entry has a Counter of refs
all_sites = {}
for i, webpage in tqdm(enumerate(WebPageSet())):
for webpage in tqdm(webpages):
website = webpage.get_website(dict_only=True)
if website != {}:
if website["name"] not in all_sites:
@@ -585,10 +589,10 @@ def find_sites_to_be_excluded_absolute(flag=100):
sites_to_exclude[website] += f"{website} may need exclusions set due to Ref {common[0]} with {common[1]} pages.\n"
return sites_to_exclude

def find_sites_to_be_excluded_relative(flag=25, relative_percent=3):
def find_sites_to_be_excluded_relative(webpages, flag=25, relative_percent=3):
# this function looks for any website with more than 'flag' webpages citing the same ref AND where that ref's page count is a significant percentage of the site's total refs
sites_to_exclude = defaultdict(list)
all_sites = find_sites_to_be_excluded()
all_sites = find_sites_to_be_excluded(webpages)
for website in all_sites:
total = sum(all_sites[website].values())
top_10 = all_sites[website].most_common(10)
@@ -659,7 +663,7 @@ def find_sites_that_may_have_removed_linker(last_linker_activity_day=20):
if not website.linker_installed:
keep = False
print(f"Alert! {domain} has removed the linker!")
sites_to_delete[domain] = f"{domain} has {website.num_webpages} pages, but has not used the linker in {last_linker_activity_day} days. {webpage.url} is the newest page."
sites_to_delete[domain] = f"{domain} has {website.get_num_webpages()} pages, but has not used the linker in {last_linker_activity_day} days. {webpage.url} is the newest page."
else:
print("Alert! Can't find website {} corresponding to webpage {}".format(data["name"], webpage.url))
webpages_without_websites += 1
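Note on the new count caching: get_num_webpages computes a WebSite's page count once, stores it on the record via save(), and returns the stored value on later calls, so find_sites_that_may_have_removed_linker no longer depends on a num_webpages field having been populated elsewhere. A short sketch of the same lazy-caching idea, assuming a record object with a domains list and a save() method as in webpage.py (the class and the count_pages callable are illustrative, not part of this commit):

class CachedCountRecord:
    def __init__(self, domains):
        self.domains = domains
        self.num_webpages = None

    def save(self):
        pass  # stand-in for the Mongo save() on the real record class

    def get_num_webpages(self, count_pages):
        # count_pages: callable taking a regex string, e.g.
        # lambda regex: WebPageSet({"url": {"$regex": regex}}).count()
        if self.num_webpages is None:
            self.num_webpages = count_pages("|".join(self.domains))
            self.save()  # persist so later calls and later runs skip the query
        return self.num_webpages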
