diff --git a/llm-complete-guide/pipelines/distilabel_generation.py b/llm-complete-guide/pipelines/distilabel_generation.py
index 18ddc5d8..4fb38906 100644
--- a/llm-complete-guide/pipelines/distilabel_generation.py
+++ b/llm-complete-guide/pipelines/distilabel_generation.py
@@ -18,6 +18,7 @@
     EMBEDDINGS_MODEL_NAME_ZENML,
 )
 from steps.distilabel_generate_queries import generate_synthetic_queries
+from steps.eval_pii import eval_pii
 from steps.hf_dataset_loader import load_hf_dataset
 from steps.push_to_argilla import push_to_argilla
 from steps.push_to_hf import push_to_hf
@@ -47,16 +48,22 @@
 @pipeline(model=model_definition)
 def generate_synthetic_data():
     train_dataset, test_dataset = load_hf_dataset()
+    _, _, _ = eval_pii(
+        train_dataset=train_dataset,
+        test_dataset=test_dataset,
+    )
     train_with_queries, test_with_queries = generate_synthetic_queries(
         train_dataset=train_dataset, test_dataset=test_dataset
     )
     push_to_hf(
         train_dataset=train_with_queries,
         test_dataset=test_with_queries,
+        after="eval_pii",
     )
     push_to_argilla(
         train_dataset=train_with_queries,
         test_dataset=test_with_queries,
+        after="eval_pii",
     )
diff --git a/llm-complete-guide/requirements.txt b/llm-complete-guide/requirements.txt
index 506d03cc..13563b92 100644
--- a/llm-complete-guide/requirements.txt
+++ b/llm-complete-guide/requirements.txt
@@ -21,6 +21,7 @@ rerankers[flashrank]
 datasets
 torch
 gradio
+huggingface-hub
 # optional requirements for S3 artifact store
 # s3fs>2022.3.0
diff --git a/llm-complete-guide/steps/distilabel_generate_queries.py b/llm-complete-guide/steps/distilabel_generate_queries.py
index 0b23a639..55372d1b 100644
--- a/llm-complete-guide/steps/distilabel_generate_queries.py
+++ b/llm-complete-guide/steps/distilabel_generate_queries.py
@@ -45,7 +45,7 @@ def generate_synthetic_queries(
     with Pipeline(name="generate_embedding_queries") as pipeline:
         load_dataset = LoadDataFromHub(
-            # num_examples=20, # use this for demo purposes
+            num_examples=40,  # limit the number of examples for demo purposes
             output_mappings={"page_content": "anchor"},
         )
         generate_sentence_pair = GenerateSentencePair(
diff --git a/llm-complete-guide/steps/eval_pii.py b/llm-complete-guide/steps/eval_pii.py
new file mode 100644
index 00000000..b81237f3
--- /dev/null
+++ b/llm-complete-guide/steps/eval_pii.py
@@ -0,0 +1,329 @@
+import io
+import re
+from collections import defaultdict
+from typing import Annotated, Dict, List, Tuple, Union
+
+import matplotlib.pyplot as plt
+from datasets import Dataset
+from PIL import Image
+from zenml import log_artifact_metadata, step
+
+
+class PIIDetector:
+    """A class to detect PII in HuggingFace datasets."""
+
+    def __init__(self):
+        # Email regex pattern
+        self.email_pattern = re.compile(
+            r"""
+            (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")
+            @
+            (?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])
+            """,
+            re.VERBOSE | re.IGNORECASE,
+        )
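+        # NOTE: the pattern above is the widely circulated RFC 5322-style
+        # address regex; it is deliberately permissive and may over-match
+        # in free-form prose.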
+
+        # Phone number patterns (US formats)
+        self.phone_pattern = re.compile(
+            r"""
+            (?:
+                # Format: (123) 456-7890 or 123-456-7890
+                (?:\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))|
+                # Format: +1 123-456-7890 or +1 (123) 456-7890
+                (?:\+1[-.\s]?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4}))|
+                # Format: 1234567890
+                (?:[0-9]{10})
+            )
+            """,
+            re.VERBOSE,
+        )
+
+        # SSN pattern (XXX-XX-XXXX)
+        self.ssn_pattern = re.compile(
+            r"""
+            (?!000|666|9\d{2})  # SSN cannot start with 000, 666, or 900-999
+            [0-9]{3}
+            [-\s]?
+            (?!00)              # Cannot have 00 in the middle group
+            [0-9]{2}
+            [-\s]?
+            (?!0000)            # Cannot end with 0000
+            [0-9]{4}
+            """,
+            re.VERBOSE,
+        )
+
+        # Credit card pattern (major card types)
+        self.credit_card_pattern = re.compile(
+            r"""
+            (?:
+                # Visa
+                4[0-9]{12}(?:[0-9]{3})?|
+                # Mastercard
+                (?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][0-9]{2}|27[01][0-9]|2720)[0-9]{12}|
+                # American Express
+                3[47][0-9]{13}|
+                # Discover
+                6(?:011|5[0-9][0-9])[0-9]{12}
+            )
+            """,
+            re.VERBOSE,
+        )
+
+        # IP address pattern (IPv4)
+        self.ip_pattern = re.compile(
+            r"""
+            \b
+            (?:
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.
+                (?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)
+            )
+            \b
+            """,
+            re.VERBOSE,
+        )
+
+        # Date pattern (common formats)
+        self.date_pattern = re.compile(
+            r"""
+            (?:
+                # MM/DD/YYYY or MM-DD-YYYY
+                (?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])[/-](?:19|20)\d\d|
+                # YYYY/MM/DD or YYYY-MM-DD
+                (?:19|20)\d\d[/-](?:0[1-9]|1[0-2])[/-](?:0[1-9]|[12][0-9]|3[01])|
+                # Month DD, YYYY
+                (?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|
+                Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|
+                Dec(?:ember)?)\s+(?:0[1-9]|[12][0-9]|3[01])(?:,|\s)+(?:19|20)\d\d
+            )
+            """,
+            re.VERBOSE | re.IGNORECASE,
+        )
+
+    def find_pii(self, text: str) -> Dict[str, List[str]]:
+        """Find all PII in a given text.
+
+        Args:
+            text (str): The text to search for PII
+
+        Returns:
+            Dict[str, List[str]]: Dictionary of PII types and their findings
+        """
+        if not isinstance(text, str):
+            return {
+                "emails": [],
+                "phones": [],
+                "ssns": [],
+                "credit_cards": [],
+                "dates": [],
+                "ips": [],
+            }
+
+        # finditer + group(0) yields the full matched strings even for
+        # patterns that contain capture groups (findall would return the
+        # group tuples instead of the matches themselves).
+        return {
+            "emails": [m.group(0) for m in self.email_pattern.finditer(text)],
+            "phones": [m.group(0) for m in self.phone_pattern.finditer(text)],
+            "ssns": [m.group(0) for m in self.ssn_pattern.finditer(text)],
+            "credit_cards": [
+                m.group(0) for m in self.credit_card_pattern.finditer(text)
+            ],
+            "dates": [m.group(0) for m in self.date_pattern.finditer(text)],
+            "ips": [m.group(0) for m in self.ip_pattern.finditer(text)],
+        }
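+
+    # Example (illustrative):
+    #   PIIDetector().find_pii("Mail jane@example.com or call 555-123-4567")
+    #   -> {"emails": ["jane@example.com"], "phones": ["555-123-4567"],
+    #      "ssns": [], "credit_cards": [], "dates": [], "ips": []}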
+
+    def scan_dataset(
+        self,
+        dataset: Dataset,
+        columns: Union[List[str], None] = None,
+        max_samples: Union[int, None] = None,
+    ) -> Dict[str, Dict]:
+        """Scan a HuggingFace dataset for PII.
+
+        Args:
+            dataset (Dataset): HuggingFace dataset to scan
+            columns (List[str], optional): Specific columns to scan. If None,
+                scans all string columns
+            max_samples (int, optional): Maximum number of samples to scan.
+                If None, scans the entire dataset
+
+        Returns:
+            Dict[str, Dict]: Dictionary containing:
+                - 'statistics': Overall statistics about the scan
+                - 'findings': Detailed findings per column
+        """
+        # Initialize results
+        results = {
+            "statistics": {
+                "total_samples_scanned": 0,
+                "columns_scanned": 0,
+                "total_findings": {
+                    "emails": 0,
+                    "phones": 0,
+                    "ssns": 0,
+                    "credit_cards": 0,
+                    "dates": 0,
+                    "ips": 0,
+                },
+            },
+            "findings": defaultdict(list),
+        }
+
+        # Determine which columns to scan
+        if columns is None:
+            # Get all columns that contain string data; getattr guards
+            # against nested features that have no dtype attribute
+            columns = [
+                col
+                for col in dataset.column_names
+                if getattr(dataset.features[col], "dtype", None)
+                in ("string", "large_string")
+            ]
+
+        results["statistics"]["columns_scanned"] = len(columns)
+
+        # Determine number of samples to scan
+        n_samples = (
+            len(dataset)
+            if max_samples is None
+            else min(max_samples, len(dataset))
+        )
+        results["statistics"]["total_samples_scanned"] = n_samples
+
+        # Scan the dataset
+        for idx in range(n_samples):
+            sample = dataset[idx]
+
+            for column in columns:
+                if column not in sample:
+                    continue
+
+                text = sample[column]
+                pii_findings = self.find_pii(text)
+
+                # Check if any PII was found
+                if any(findings for findings in pii_findings.values()):
+                    # Update statistics
+                    for pii_type, findings in pii_findings.items():
+                        results["statistics"]["total_findings"][pii_type] += (
+                            len(findings)
+                        )
+
+                    # Record detailed findings
+                    results["findings"][column].append(
+                        {"index": idx, "findings": pii_findings}
+                    )
+
+        return results
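+
+# Example (illustrative): scan a single column and cap the number of rows:
+#   PIIDetector().scan_dataset(ds, columns=["page_content"], max_samples=100)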
+
+
+def plot_pii_results(
+    train_results: Dict[str, Dict], test_results: Dict[str, Dict]
+) -> Image.Image:
+    """Render the combined train/test PII counts as a pie chart."""
+    total_findings = {
+        "Emails": (
+            train_results["statistics"]["total_findings"]["emails"]
+            + test_results["statistics"]["total_findings"]["emails"]
+        ),
+        "Phone Numbers": (
+            train_results["statistics"]["total_findings"]["phones"]
+            + test_results["statistics"]["total_findings"]["phones"]
+        ),
+        "SSNs": (
+            train_results["statistics"]["total_findings"]["ssns"]
+            + test_results["statistics"]["total_findings"]["ssns"]
+        ),
+        "Credit Cards": (
+            train_results["statistics"]["total_findings"]["credit_cards"]
+            + test_results["statistics"]["total_findings"]["credit_cards"]
+        ),
+        "Dates": (
+            train_results["statistics"]["total_findings"]["dates"]
+            + test_results["statistics"]["total_findings"]["dates"]
+        ),
+        "IP Addresses": (
+            train_results["statistics"]["total_findings"]["ips"]
+            + test_results["statistics"]["total_findings"]["ips"]
+        ),
+    }
+
+    plt.figure(figsize=(10, 8))
+    labels = [f"{k}\n({v})" for k, v in total_findings.items() if v > 0]
+    values = [v for v in total_findings.values() if v > 0]
+
+    if values:  # Only create pie chart if there are findings
+        plt.pie(values, labels=labels, autopct="%1.1f%%")
+        plt.title("Distribution of PII Findings in Dataset")
+    else:
+        plt.text(
+            0.5,
+            0.5,
+            "No PII Found",
+            horizontalalignment="center",
+            verticalalignment="center",
+        )
+
+    # Convert plot to PIL Image
+    buf = io.BytesIO()
+    plt.savefig(buf, format="png", bbox_inches="tight")
+    buf.seek(0)
+    plt.close()  # Clean up matplotlib figure
+    return Image.open(buf)
+
+
+@step
+def eval_pii(
+    train_dataset: Dataset, test_dataset: Dataset
+) -> Tuple[
+    Annotated[Dict[str, Dict], "train_pii_results"],
+    Annotated[Dict[str, Dict], "test_pii_results"],
+    Annotated[Image.Image, "PII chart"],
+]:
+    """Scan both dataset splits for PII, log summary metadata, and plot."""
+    detector = PIIDetector()
+    train_results = detector.scan_dataset(
+        dataset=train_dataset,
+        # columns=["text"],  # specify columns to scan, or None for all string columns
+        # max_samples=1000,  # optional: limit the number of samples to scan
+    )
+    test_results = detector.scan_dataset(
+        dataset=test_dataset,
+        # columns=["text"],
+        # max_samples=1000,  # optional: limit the number of samples to scan
+    )
+
+    train_metadata = {
+        "samples_scanned": train_results["statistics"][
+            "total_samples_scanned"
+        ],
+        "emails_found": train_results["statistics"]["total_findings"][
+            "emails"
+        ],
+        "phones_found": train_results["statistics"]["total_findings"][
+            "phones"
+        ],
+        "ssns_found": train_results["statistics"]["total_findings"]["ssns"],
+        "credit_cards_found": train_results["statistics"]["total_findings"][
+            "credit_cards"
+        ],
+        "dates_found": train_results["statistics"]["total_findings"]["dates"],
+        "ips_found": train_results["statistics"]["total_findings"]["ips"],
+    }
+    log_artifact_metadata(
+        metadata=train_metadata, artifact_name="train_pii_results"
+    )
+
+    test_metadata = {
+        "samples_scanned": test_results["statistics"]["total_samples_scanned"],
+        "emails_found": test_results["statistics"]["total_findings"]["emails"],
+        "phones_found": test_results["statistics"]["total_findings"]["phones"],
+        "ssns_found": test_results["statistics"]["total_findings"]["ssns"],
+        "credit_cards_found": test_results["statistics"]["total_findings"][
+            "credit_cards"
+        ],
+        "dates_found": test_results["statistics"]["total_findings"]["dates"],
+        "ips_found": test_results["statistics"]["total_findings"]["ips"],
+    }
+    log_artifact_metadata(
+        metadata=test_metadata, artifact_name="test_pii_results"
+    )
+
+    pii_chart = plot_pii_results(train_results, test_results)
+
+    return train_results, test_results, pii_chart
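+
+
+if __name__ == "__main__":
+    # Illustrative smoke test (not part of the pipeline): run the detector
+    # directly on a tiny in-memory dataset, outside any ZenML step. Only
+    # `datasets` is needed; column name and sample text are made up here.
+    demo = Dataset.from_dict(
+        {"page_content": ["Contact jane.doe@example.com or 555-123-4567."]}
+    )
+    print(PIIDetector().scan_dataset(demo)["statistics"])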