diff --git a/.gitignore b/.gitignore index b9e07b8..8f4c56a 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,6 @@ dmypy.json output/ wikihow2zim/assets/vendor/ + +# Visual Studio Code configuration which is not maintained +.vscode \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8b62e09..b23e422 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,7 @@ requests>=2.28.0,<3.0 Jinja2>=3.1.2,<4.0 zimscraperlib>=2.0.0,<2.1 -css-beautify>=1.10.3<2.0 -six>=1.16.0,<2.0 # css-beautify dependency +cssbeautifier>=1.10.3,<2.0 kiwixstorage>=0.8.1,<0.9 pif>=0.8.2,<0.9 tld>=0.12.6,<0.13 diff --git a/wikihow2zim/scraper.py b/wikihow2zim/scraper.py index 91835c8..f27409b 100644 --- a/wikihow2zim/scraper.py +++ b/wikihow2zim/scraper.py @@ -512,15 +512,17 @@ def scrape_categories(self): logger.info("Scraping expected category pages") for category in self.expected_categories: self.scrape_category(category) + if self.conf.delay: + time.sleep(self.conf.delay) def scrape_category(self, category: str): logger.info(f"> Category:{category}") nb_pages = self.scrape_category_page(category, page_num=1) if nb_pages > 1: for page_num in range(2, nb_pages + 1): - self.scrape_category_page(category, page_num=page_num) if self.conf.delay: time.sleep(self.conf.delay) + self.scrape_category_page(category, page_num=page_num) def scrape_category_page(self, category: str, page_num: int): category_url = f"/{self.metadata['category_prefix']}:{category}" diff --git a/wikihow2zim/utils.py b/wikihow2zim/utils.py index bd4ceb9..6fd4e18 100644 --- a/wikihow2zim/utils.py +++ b/wikihow2zim/utils.py @@ -268,7 +268,7 @@ def write(line): end = line.index(")") # check whether it's quoted or not - if line[start + 1] in ("'", '"'): + if line[start] in ("'", '"'): start += 1 end -= 1