Skip to content

Commit

Permalink
Merge pull request #429 from mojavelinux/issue-425-bypass-nb_hits_updater
Browse files: browse the repository at this point in the history

Resolves #425: bypass the nb_hits updater confirmation prompt if UPDATE_NB_HITS is set or the terminal is not a TTY
  • Loading branch information
Sylvain Pace authored Mar 22, 2019
2 parents ca08e70 + 73ed0bc commit c21c801
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 6 deletions.
9 changes: 7 additions & 2 deletions scraper/src/config/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

from collections import OrderedDict
from distutils.util import strtobool
import json
import os
import copy
Expand Down Expand Up @@ -49,6 +50,7 @@ class ConfigLoader(object):
strategy = 'default'
strict_redirect = True
strip_chars = u".,;:§¶"
update_nb_hits = None
use_anchors = False
user_agent = 'Algolia DocSearch Crawler'
only_content_level = False
Expand Down Expand Up @@ -112,6 +114,9 @@ def _parse(self):
# Parse Env
self.app_id = os.environ.get('APPLICATION_ID', None)
self.api_key = os.environ.get('API_KEY', None)
self.update_nb_hits = os.environ.get('UPDATE_NB_HITS', None)
if self.update_nb_hits is not None:
self.update_nb_hits = bool(strtobool(self.update_nb_hits))

# Parse config
self.selectors = SelectorsParser().parse(self.selectors)
Expand All @@ -124,15 +129,15 @@ def _parse(self):
self.allowed_domains = UrlsParser.build_allowed_domains(
self.start_urls, self.stop_urls)

def update_nb_hits(self, nb_hits):
def update_nb_hits_value(self, nb_hits):
if self.config_file is not None:
# config loaded from file
previous_nb_hits = None if 'nb_hits' not in self.config_content else \
self.config_content['nb_hits']
nb_hit_updater = NbHitsUpdater(self.config_file,
self.config_content,
previous_nb_hits, nb_hits)
nb_hit_updater.update()
nb_hit_updater.update(self.update_nb_hits)

def get_extra_facets(self):
return UrlsParser.get_extra_facets(self.start_urls)
12 changes: 9 additions & 3 deletions scraper/src/config/nb_hits_updater.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from ..helpers import confirm
import json
import copy
import sys


class NbHitsUpdater(object):
Expand All @@ -16,12 +17,17 @@ def __init__(self, config_file, config_content, previous_nb_hits,
self.new_nb_hit = new_nb_hit
self.previous_nb_hits = previous_nb_hits

def update(self):
def update(self, perform_update):
if self._update_needed():
print("previous nb_hits: " + str(self.previous_nb_hits) + "\n")

if confirm(
'Do you want to update the nb_hits in ' + self.config_file + ' ?'):
if perform_update is None:
if sys.stdout.isatty():
perform_update = confirm('Do you want to update the nb_hits in ' + self.config_file + ' ?')
else:
perform_update = True

if perform_update:
try:
self._update_config()
print("\n[OK] " + self.config_file + " has been updated")
Expand Down
2 changes: 1 addition & 1 deletion scraper/src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def run_config(config):
if DocumentationSpider.NB_INDEXED > 0:
algolia_helper.commit_tmp_index()
print('Nb hits: ' + str(DocumentationSpider.NB_INDEXED))
config.update_nb_hits(DocumentationSpider.NB_INDEXED)
config.update_nb_hits_value(DocumentationSpider.NB_INDEXED)
else:
print('Crawling issue: nbHits 0 for ' + config.index_name)
algolia_helper.report_crawling_issue()
Expand Down

0 comments on commit c21c801

Please sign in to comment.