From 476a9dd0634ba0602cb2624441054cbf3ff2f226 Mon Sep 17 00:00:00 2001 From: Valeriy Mukhtarulin Date: Thu, 24 Oct 2019 15:13:38 -0400 Subject: [PATCH] Simpler duplicates (#171) * One find to rule them all * Update docs * Add uniques to report_all * Changes --- CHANGES.md | 1 + docs/source/nbs/Rules.ipynb | 293 +--------------------- src/arche/arche.py | 23 +- src/arche/data_quality_report.py | 29 +-- src/arche/figures/tables.py | 9 - src/arche/quality_estimation_algorithm.py | 40 +-- src/arche/rules/duplicates.py | 100 ++++---- tests/rules/test_duplicates.py | 104 +++----- tests/test_arche.py | 5 +- 9 files changed, 118 insertions(+), 486 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 6f94d00..48eee23 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -19,6 +19,7 @@ Note that the top-most release is changes in the unreleased master branch on Git - Added `outcome` property on Result, in order to define a rule outcome based on message levells. #173 ### Changed - Reports rendering. Reports are being generated as HTML with a jinja2 template. `Arche.report_all()` displays the rules results grouped by outcome. The plots are displayed on the "plots" tab. #168 +- `report_all()` accepts `uniques` arg to find duplicates among columns/rows, #171 ## [0.3.6] (2019-07-12) diff --git a/docs/source/nbs/Rules.ipynb b/docs/source/nbs/Rules.ipynb index 5d7b7e2..7889285 100644 --- a/docs/source/nbs/Rules.ipynb +++ b/docs/source/nbs/Rules.ipynb @@ -1,292 +1 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Rules" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook contains rules used in the library with examples. Some rules executed during `Arche.report_all()`, and some are meant to be executed separately.\n", - "\n", - "Some definitions here are used interchangeably:\n", - "\n", - "* Rule - a test case for data. As a test case, it can be failed, passed or skipped. Some of the rules output only information like [Category fields](#Category-fields)\n", - "\n", - "* **df** - a dataframe which holds input data (from a job, collection or other source)\n", - "\n", - "* Scrapy cloud item - a row in a **df**\n", - "\n", - "* Items fields - columns in a **df**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import arche\n", - "from arche import *\n", - "from arche.readers.items import Items" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_8.csv\"))\n", - "target_items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_7.csv\"))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = items.df.drop(columns=[\"_type\"])\n", - "target_df = target_items.df" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Accessing Graphs Data\n", - "The data is in `stats`. See `Result` class for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.coverage.check_fields_coverage(df).stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Coverage\n", - "### Fields coverage on input data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.coverage.check_fields_coverage)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.coverage.check_fields_coverage(df).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Anomalies" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.coverage.anomalies)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "res = arche.rules.coverage.anomalies(target=\"381798/2/4\", sample=[\"381798/2/8\", \"381798/2/7\", \"381798/2/6\"])\n", - "res.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Categories" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Category fields" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.category.get_categories)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.category.get_categories(df, max_uniques=200).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Category coverage\n", - "In `report_all()`, these rules use `category` tag." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.category.get_coverage_per_category)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.category.get_coverage_per_category(df, [\"category\"]).show()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.category.get_difference)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.category.get_difference(df, target_df, [\"category\"]).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Compare\n", - "### Fields" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.compare.fields)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Duplicates\n", - "### Find duplicates by columns (fields)\n", - "This rule is not included in `Arche.report_all()`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "help(arche.rules.duplicates.find_by)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "arche.rules.duplicates.find_by(df, [\"name\", \"part_number\"]).show(short=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.3" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} +{"cells":[{"cell_type":"markdown","metadata":{},"outputs":[],"source":["# Rules"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["This notebook contains rules used in the library with examples. Some rules executed during `Arche.report_all()`, and some are meant to be executed separately.\n","\n","Some definitions here are used interchangeably:\n","\n","* Rule - a test case for data. As a test case, it can be failed, passed or skipped. Some of the rules output only information like [Category fields](#Category-fields)\n","\n","* **df** - a dataframe which holds input data (from a job, collection or other source)\n","\n","* Scrapy cloud item - a row in a **df**\n","\n","* Items fields - columns in a **df**"]},{"cell_type":"code","execution_count":1,"metadata":{},"outputs":[],"source":"import arche\nfrom arche import *\nfrom arche.readers.items import Items"},{"cell_type":"code","execution_count":2,"metadata":{},"outputs":[],"source":["items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_8.csv\"))\n","target_items = Items.from_df(pd.read_csv(\"https://raw.githubusercontent.com/scrapinghub/arche/master/docs/source/nbs/data/items_products_7.csv\"))"]},{"cell_type":"code","execution_count":4,"metadata":{},"outputs":[],"source":["df = items.df.drop(columns=[\"_type\"])\n","target_df = target_items.df"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Accessing Graphs Data\n","The data is in `stats`. See `Result` class for more details."]},{"cell_type":"code","execution_count":11,"metadata":{},"outputs":[],"source":["arche.rules.coverage.check_fields_coverage(df).stats"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Coverage\n","### Fields coverage on input data"]},{"cell_type":"code","execution_count":12,"metadata":{},"outputs":[],"source":["help(arche.rules.coverage.check_fields_coverage)"]},{"cell_type":"code","execution_count":13,"metadata":{},"outputs":[],"source":["arche.rules.coverage.check_fields_coverage(df).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["### Anomalies"]},{"cell_type":"code","execution_count":14,"metadata":{},"outputs":[],"source":["help(arche.rules.coverage.anomalies)"]},{"cell_type":"code","execution_count":15,"metadata":{},"outputs":[],"source":["res = arche.rules.coverage.anomalies(target=\"381798/2/4\", sample=[\"381798/2/8\", \"381798/2/7\", \"381798/2/6\"])\n","res.show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Categories"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["### Category fields"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.category.get_categories)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.category.get_categories(df, max_uniques=200).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["### Category coverage\n","In `report_all()`, these rules use `category` tag."]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.category.get_coverage_per_category)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.category.get_coverage_per_category(df, [\"category\"]).show()"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.category.get_difference)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.category.get_difference(df, target_df, [\"category\"]).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":["## Compare\n","### Fields"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["help(arche.rules.compare.fields)"]},{"cell_type":"code","execution_count":0,"metadata":{},"outputs":[],"source":["arche.rules.compare.fields(df, target_df, [\"part_number\", \"name\", \"uom\"]).show()"]},{"cell_type":"markdown","metadata":{},"outputs":[],"source":"## Duplicates\n### Find duplicates by any combination of columns (fields)\nThis rule is executed when `uniques` is passed to `Arche.report_all()`."},{"cell_type":"code","execution_count":5,"metadata":{},"outputs":[],"source":["help(arche.rules.duplicates.find_by)"]},{"cell_type":"code","execution_count":8,"metadata":{},"outputs":[],"source":["arche.rules.duplicates.find_by(df, [\"uom\", [\"name\", \"part_number\"]]).show(short=True)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":""}],"nbformat":4,"nbformat_minor":2,"metadata":{"language_info":{"name":"python","codemirror_mode":{"name":"ipython","version":3}},"orig_nbformat":2,"file_extension":".py","mimetype":"text/x-python","name":"python","npconvert_exporter":"python","pygments_lexer":"ipython3","version":3}} \ No newline at end of file diff --git a/src/arche/arche.py b/src/arche/arche.py index 9e8e618..cf081fc 100755 --- a/src/arche/arche.py +++ b/src/arche/arche.py @@ -1,6 +1,6 @@ from functools import lru_cache import logging -from typing import Iterable, Optional, Union, cast +from typing import Iterable, List, Optional, Union, cast from arche.data_quality_report import DataQualityReport from arche.readers.items import Items, CollectionItems, JobItems, RawItems @@ -124,12 +124,21 @@ def get_items( def save_result(self, rule_result): self.report.save(rule_result) - def report_all(self, short: bool = False) -> None: + def report_all( + self, short: bool = False, uniques: List[Union[str, List[str]]] = None + ) -> None: + """Report on all included rules. + + Args: + uniques: see `arche.rules.duplicates.find_by` + """ + if uniques: + self.uniques = uniques self.run_all_rules() IPython.display.clear_output() self.report(keys_limit=10 if short else None) - def run_all_rules(self): + def run_all_rules(self) -> None: if isinstance(self.source_items, JobItems): self.check_metadata(self.source_items.job) if self.target_items: @@ -146,7 +155,6 @@ def data_quality_report(self, bucket: Optional[str] = None): IPython.display.clear_output() DataQualityReport(self.source_items, self.schema, self.report, bucket) - @lru_cache(maxsize=32) def run_general_rules(self): self.save_result(garbage_symbols(self.source_items.df)) df = self.source_items.df @@ -156,6 +164,10 @@ def run_general_rules(self): ) ) self.save_result(category_rules.get_categories(df)) + if getattr(self, "uniques", None): + self.save_result( + duplicate_rules.find_by(self.source_items.df, self.uniques) + ) def validate_with_json_schema(self) -> None: """Run JSON schema check and output results. It will try to find all errors, but @@ -208,8 +220,7 @@ def run_schema_rules(self) -> None: def run_customized_rules(self, items, tagged_fields): self.save_result(price_rules.compare_was_now(items.df, tagged_fields)) - self.save_result(duplicate_rules.find_by_unique(items.df, tagged_fields)) - self.save_result(duplicate_rules.find_by_name_url(items.df, tagged_fields)) + self.save_result(duplicate_rules.find_by_tags(items.df, tagged_fields)) self.save_result( category_rules.get_coverage_per_category( items.df, tagged_fields.get("category", []) + self.schema.enums diff --git a/src/arche/data_quality_report.py b/src/arche/data_quality_report.py index 9f70aa2..5f2fa61 100755 --- a/src/arche/data_quality_report.py +++ b/src/arche/data_quality_report.py @@ -49,14 +49,8 @@ def __init__( ) def create_figures(self, items: JobItems): - name_url_dups = self.report.results.get( - "Duplicates By **name_field, product_url_field** Tags", - duplicate_rules.find_by_name_url(items.df, self.schema.tags), - ) - - uniques = self.report.results.get( - "Duplicates By **unique** Tag", - duplicate_rules.find_by_unique(items.df, self.schema.tags), + dups = self.report.results.get( + "Duplicates", duplicate_rules.find_by_tags(items.df, self.schema.tags) ) price_was_now_result = price_rules.compare_was_now(items.df, self.schema.tags) @@ -80,10 +74,8 @@ def create_figures(self, items: JobItems): items.job, crawlera_user, validation_errors, - name_url_dups.err_items_count, - name_url_dups.items_count, - uniques.err_items_count, - uniques.items_count, + dups.err_items_count, + dups.items_count, no_of_price_warns, no_of_checked_price_items, tested=True, @@ -97,11 +89,8 @@ def create_figures(self, items: JobItems): validation_errors, self.schema.tags.get("name_field", ""), self.schema.tags.get("product_url_field", ""), - name_url_dups.items_count, - name_url_dups.err_items_count, - self.schema.tags.get("unique", []), - uniques.items_count, - uniques.err_items_count, + dups.err_items_count, + dups.items_count, self.schema.tags.get("product_price_field", ""), self.schema.tags.get("product_price_was_field", ""), no_of_checked_price_items, @@ -158,9 +147,6 @@ def rules_summary_table( url_field, no_of_checked_duplicated_items, no_of_duplicated_items, - unique, - no_of_checked_skus, - no_of_duplicated_skus, price_field, price_was_field, no_of_checked_price_items, @@ -175,9 +161,6 @@ def rules_summary_table( url_field, no_of_checked_duplicated_items, no_of_duplicated_items, - unique, - no_of_checked_skus, - no_of_duplicated_skus, price_field, price_was_field, no_of_checked_price_items, diff --git a/src/arche/figures/tables.py b/src/arche/figures/tables.py index 8dd1dc2..3b08f46 100755 --- a/src/arche/figures/tables.py +++ b/src/arche/figures/tables.py @@ -130,9 +130,6 @@ def rules_summary_table( url_field, no_of_checked_duplicated_items, no_of_duplicated_items, - unique, - no_of_checked_skus, - no_of_duplicated_skus, price_field, price_was_field, no_of_checked_price_items, @@ -150,12 +147,6 @@ def rules_summary_table( test_results_values.append(f"{no_of_duplicated_items} warnings") status_values.append(get_rule_status(no_of_duplicated_items)) - if no_of_checked_skus: - test_name_values.append("Duplicated Field Values") - tested_fields_values.append(unique) - test_results_values.append(f"{no_of_duplicated_skus} warnings") - status_values.append(get_rule_status(no_of_duplicated_skus)) - if no_of_checked_price_items: test_name_values.append("Prices comparison") tested_fields_values.append(f"{price_field}, {price_was_field}") diff --git a/src/arche/quality_estimation_algorithm.py b/src/arche/quality_estimation_algorithm.py index 3bdf48e..6f334ad 100755 --- a/src/arche/quality_estimation_algorithm.py +++ b/src/arche/quality_estimation_algorithm.py @@ -7,8 +7,6 @@ def generate_quality_estimation( no_of_validation_warnings, no_of_duplicated_items, checked_dup_items_count, - no_of_duplicated_skus, - no_of_checked_skus_items, no_of_price_warns, no_of_checked_price_items, tested, @@ -27,9 +25,6 @@ def generate_quality_estimation( duplicated_items_percent = float( get_duplicated_items_percent(no_of_duplicated_items, no_of_scraped_items) ) - duplicated_skus_percent = float( - get_duplicated_skus_percent(no_of_duplicated_skus, no_of_scraped_items) - ) crawlera_incapsula_percent = float(get_crawlera_incapsula_percent(crawlera_user)) @@ -45,13 +40,7 @@ def generate_quality_estimation( ) tested_percent = float(get_tested_percent(tested)) - if all( - [ - checked_dup_items_count == 0, - no_of_checked_skus_items == 0, - no_of_checked_price_items == 0, - ] - ): + if all([checked_dup_items_count == 0, no_of_checked_price_items == 0]): quality_estimation = ( adherence_to_schema_percent * 60 / 100 + crawlera_incapsula_percent * 8 / 100 @@ -60,7 +49,7 @@ def generate_quality_estimation( + response_status_count_percent * 7 / 100 + tested_percent * 15 / 100 ) - elif checked_dup_items_count == 0 and no_of_checked_skus_items == 0: + elif checked_dup_items_count == 0: quality_estimation = ( adherence_to_schema_percent * 55 / 100 + crawlera_incapsula_percent * 8 / 100 @@ -72,15 +61,14 @@ def generate_quality_estimation( ) elif checked_dup_items_count == 0 and no_of_checked_price_items == 0: quality_estimation = ( - adherence_to_schema_percent * 55 / 100 - + duplicated_skus_percent * 5 / 100 + adherence_to_schema_percent * 60 / 100 + crawlera_incapsula_percent * 8 / 100 + no_of_errors_percent * 5 / 100 + outcome_percent * 5 / 100 + response_status_count_percent * 7 / 100 + tested_percent * 15 / 100 ) - elif no_of_checked_skus_items == 0 and no_of_checked_price_items == 0: + elif no_of_checked_price_items == 0: quality_estimation = ( adherence_to_schema_percent * 50 / 100 + duplicated_items_percent * 10 / 100 @@ -92,19 +80,7 @@ def generate_quality_estimation( ) elif checked_dup_items_count == 0: quality_estimation = ( - adherence_to_schema_percent * 50 / 100 - + duplicated_skus_percent * 5 / 100 - + crawlera_incapsula_percent * 8 / 100 - + no_of_errors_percent * 5 / 100 - + price_was_price_now_comparison_percent * 5 / 100 - + outcome_percent * 5 / 100 - + response_status_count_percent * 7 / 100 - + tested_percent * 15 / 100 - ) - elif no_of_checked_skus_items == 0: - quality_estimation = ( - adherence_to_schema_percent * 45 / 100 - + duplicated_items_percent * 10 / 100 + adherence_to_schema_percent * 55 / 100 + crawlera_incapsula_percent * 8 / 100 + no_of_errors_percent * 5 / 100 + price_was_price_now_comparison_percent * 5 / 100 @@ -115,8 +91,7 @@ def generate_quality_estimation( elif no_of_checked_price_items == 0: quality_estimation = ( adherence_to_schema_percent * 45 / 100 - + duplicated_items_percent * 10 / 100 - + duplicated_skus_percent * 5 / 100 + + duplicated_items_percent * 15 / 100 + crawlera_incapsula_percent * 8 / 100 + no_of_errors_percent * 5 / 100 + outcome_percent * 5 / 100 @@ -126,8 +101,7 @@ def generate_quality_estimation( else: quality_estimation = ( adherence_to_schema_percent * 40 / 100 - + duplicated_items_percent * 10 / 100 - + duplicated_skus_percent * 5 / 100 + + duplicated_items_percent * 15 / 100 + crawlera_incapsula_percent * 8 / 100 + no_of_errors_percent * 5 / 100 + price_was_price_now_comparison_percent * 5 / 100 diff --git a/src/arche/rules/duplicates.py b/src/arche/rules/duplicates.py index 1c54962..1a7bd73 100755 --- a/src/arche/rules/duplicates.py +++ b/src/arche/rules/duplicates.py @@ -1,78 +1,64 @@ -from typing import List, Set +from collections import Iterable +from typing import Any, Generator, List, Union from arche.readers.schema import TaggedFields from arche.rules.result import Result, Outcome import pandas as pd -def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result: - """Verify if each item field tagged with `unique` is unique. +def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result: + """Find equal items rows in `df` by `uniques`. I.e. if two items have the same + uniques's element value, they are considered duplicates. - Returns: - A result containing field names and keys for non unique items - """ - unique_fields = tagged_fields.get("unique", []) - result = Result("Duplicates By **unique** Tag") - - if not unique_fields: - result.outcome = Outcome.SKIPPED - return result - - err_keys: Set = set() - for field in unique_fields: - result.items_count = df[field].count() - duplicates = df[df.duplicated(field, keep=False)][[field]] - errors = {} - for _, d in duplicates.groupby([field]): - keys = list(d.index) - msg = f"same '{d[field].iloc[0]}' `{field}`" - errors[msg] = keys - err_keys = err_keys.union(keys) - if not duplicates.empty: - result.add_error( - f"{field} contains {len(duplicates[field].unique())} duplicated value(s)", - errors=errors, - ) - - return result - - -def find_by(df: pd.DataFrame, columns: List[str]) -> Result: - """Compare items rows in `df` by `columns` + Args: + uniques: list containing columns and list of columns to identify duplicates. + List of columns means that all list columns values should be equal. Returns: Any duplicates """ - result = Result(f"Duplicates") + result = Result("Duplicates") result.items_count = len(df) - df = df.dropna(subset=columns, how="all") - duplicates = df[df.duplicated(columns, keep=False)][columns] - if duplicates.empty: - return result - errors = {} - for _, d in duplicates.groupby(columns): - msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in columns] - errors[f"same {', '.join(msgs)}"] = list(d.index) + df = df.dropna(subset=list(set(flatten(uniques))), how="all") + for columns in uniques: + mask = columns if isinstance(columns, list) else [columns] + duplicates = df[df.duplicated(columns, keep=False)][mask] + if duplicates.empty: + continue - result.add_error( - f"{len(duplicates)} duplicate(s) with same {', '.join(columns)}", errors=errors - ) + errors = {} + grouped = duplicates.groupby(columns) + for _, d in grouped: + msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in mask] + errors[f"same {', '.join(msgs)}"] = list(d.index) + result.add_error( + f"{', '.join(mask)} contains {len(grouped)} duplicated value(s)", + errors=errors, + ) return result -def find_by_name_url(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result: - """Check for items with the same name and url""" +def find_by_tags(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result: + """Check for duplicates based on schema tags. In particular, look for items with + the same `name_field` and `product_url_field`, and for uniqueness among `unique` field""" name_fields = tagged_fields.get("name_field") url_fields = tagged_fields.get("product_url_field") - name = "Duplicates By **name_field, product_url_field** Tags" - result = Result(name) - if not name_fields or not url_fields: - result.outcome = Outcome.SKIPPED + columns_to_check: List = tagged_fields.get("unique", []) + if (not name_fields or not url_fields) and not columns_to_check: + result = Result("Duplicates") + result.add_info(Outcome.SKIPPED) return result - name_field = name_fields[0] - url_field = url_fields[0] - result = find_by(df, [name_field, url_field]) - result.name = name - return result + if name_fields and url_fields: + columns_to_check.extend([[name_fields[0], url_fields[0]]]) + + return find_by(df, columns_to_check) + + +def flatten(l: Any) -> Generator[str, None, None]: + for el in l: + if isinstance(el, Iterable) and not isinstance(el, str): + yield from flatten(el) + else: + yield el diff --git a/tests/rules/test_duplicates.py b/tests/rules/test_duplicates.py index a043bfa..bc7fece 100755 --- a/tests/rules/test_duplicates.py +++ b/tests/rules/test_duplicates.py @@ -6,75 +6,48 @@ import pytest -unique_inputs = [ - ( - {"id": ["0", "0", "1"]}, - {"unique": ["id"]}, - { - Level.ERROR: [ - ("id contains 1 duplicated value(s)", None, {"same '0' `id`": [0, 1]}) - ] - }, - ), - ( - { - "id": ["47" for x in range(6)], - "name": ["Walt", "Juan", "Juan", "Walt", "Walt", "John"], - }, - {"unique": ["id", "name"]}, - { - Level.ERROR: [ - ( - "id contains 1 duplicated value(s)", - None, - {"same '47' `id`": [i for i in range(6)]}, - ), - ( - "name contains 2 duplicated value(s)", - None, - {"same 'Juan' `name`": [1, 2], "same 'Walt' `name`": [0, 3, 4]}, - ), - ] - }, - ), - ({"name": ["a", "b"]}, {"unique": ["name"]}, {}), -] - - -@pytest.mark.parametrize("data, tagged_fields, expected_messages", unique_inputs) -def test_find_by_unique(data, tagged_fields, expected_messages): - df = pd.DataFrame(data) - assert_results_equal( - duplicates.find_by_unique(df, tagged_fields), - create_result( - "Duplicates By **unique** Tag", expected_messages, items_count=len(df) - ), - ) - - @pytest.mark.parametrize( "data, columns, expected_messages", [ + ({"id": ["0", "1", "2"]}, ["id"], {}), ( {"id": ["0", "0", "1"]}, ["id"], { Level.ERROR: [ - ("2 duplicate(s) with same id", None, {"same '0' `id`": [0, 1]}) + ( + "id contains 1 duplicated value(s)", + None, + {"same '0' `id`": [0, 1]}, + ) ] }, ), - ({"id": ["0", "1", "2"]}, ["id"], {}), ( - {"id": [np.nan, "9", "9"], "city": [np.nan, "Talca", "Talca"]}, - ["id", "city"], + { + "id": [np.nan, "9", "9"], + "city": [np.nan, "Talca", "Talca"], + "id2": ["47" for x in range(3)], + "name": ["Walt", "Juan", "Juan"], + }, + [["id", "city"], "id2", "name"], { Level.ERROR: [ ( - "2 duplicate(s) with same id, city", + "id, city contains 1 duplicated value(s)", None, {"same '9' `id`, 'Talca' `city`": [1, 2]}, - ) + ), + ( + "id2 contains 1 duplicated value(s)", + None, + {"same '47' `id2`": [0, 1, 2]}, + ), + ( + "name contains 1 duplicated value(s)", + None, + {"same 'Juan' `name`": [1, 2]}, + ), ] }, ), @@ -92,15 +65,24 @@ def test_find_by(data, columns, expected_messages): "data, tagged_fields, expected_messages", [ ( - {"name": ["bob", "bob", "bob", "bob"], "url": ["u1", "u1", "2", "u1"]}, - {"name_field": ["name"], "product_url_field": ["url"]}, + { + "name": ["bob", "bob", "bob", "bob"], + "url": ["u1", "u1", "2", "u1"], + "id": [np.nan, "9", "9", None], + }, + {"name_field": ["name"], "product_url_field": ["url"], "unique": ["id"]}, { Level.ERROR: [ ( - "3 duplicate(s) with same name, url", + "id contains 1 duplicated value(s)", + None, + {"same '9' `id`": [1, 2]}, + ), + ( + "name, url contains 1 duplicated value(s)", None, {"same 'bob' `name`, 'u1' `url`": [0, 1, 3]}, - ) + ), ] }, ), @@ -111,13 +93,9 @@ def test_find_by(data, columns, expected_messages): ), ], ) -def test_find_by_name_url(data, tagged_fields, expected_messages): +def test_find_by_tags(data, tagged_fields, expected_messages): df = pd.DataFrame(data) assert_results_equal( - duplicates.find_by_name_url(df, tagged_fields), - create_result( - "Duplicates By **name_field, product_url_field** Tags", - expected_messages, - items_count=len(df), - ), + duplicates.find_by_tags(df, tagged_fields), + create_result("Duplicates", expected_messages, items_count=len(df)), ) diff --git a/tests/test_arche.py b/tests/test_arche.py index 95ae0c9..6266706 100755 --- a/tests/test_arche.py +++ b/tests/test_arche.py @@ -134,8 +134,7 @@ def test_arche_dataframe(mocker): "JSON Schema Validation", "Tags", "Compare Price Was And Now", - "Duplicates By **unique** Tag", - "Duplicates By **name_field, product_url_field** Tags", + "Duplicates", "Coverage For Scraped Categories", "Category Coverage Difference", "Compare Prices For Same Urls", @@ -214,7 +213,7 @@ def test_run_all_rules_collection(mocker, get_collection_items): mocked_check_metadata.assert_not_called() mocked_compare_metadata.assert_not_called() - mocked_run_general_rules.assert_called_once_with() + mocked_run_general_rules.assert_called_once_with(arche) mocked_run_comparison_rules.assert_called_once_with() mocked_run_schema_rules.assert_called_once_with(arche)