From d939a8622020be016a85483c3ec6c21b655e6469 Mon Sep 17 00:00:00 2001
From: Anurag
Date: Mon, 14 Oct 2024 00:28:14 +0530
Subject: [PATCH 1/4] refactor: md changes documents -> items

---
 docs/concepts/pipelines.md | 4 ++--
 docs/examples/custom-parsing.md | 4 ++--
 docs/examples/mining-product-reviews.md | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docs/concepts/pipelines.md b/docs/concepts/pipelines.md
index ed0e4cae..bc1d1b3b 100644
--- a/docs/concepts/pipelines.md
+++ b/docs/concepts/pipelines.md
@@ -1,6 +1,6 @@
 # Pipelines

-Pipelines in DocETL are the core structures that define the flow of data processing. They orchestrate the application of operators to datasets, creating a seamless workflow for complex document processing tasks.
+Pipelines in DocETL are the core structures that define the flow of data processing. They orchestrate the application of operators to datasets, creating a seamless workflow for complex chunk processing tasks.

 ## Components of a Pipeline

@@ -21,7 +21,7 @@ default_model: gpt-4o-mini

 ### Datasets

-Datasets define the input data for your pipeline. They are collections of documents, where each document is an object in a JSON list (or row in a CSV file). Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example:
+Datasets define the input data for your pipeline. They are collections of items/chunks, where each item/chunk is an object in a JSON list (or row in a CSV file). Datasets are typically specified in the YAML configuration file, indicating the type and path of the data source. For example:

 ```yaml
 datasets:
diff --git a/docs/examples/custom-parsing.md b/docs/examples/custom-parsing.md
index a8959f97..9b4d1b3a 100644
--- a/docs/examples/custom-parsing.md
+++ b/docs/examples/custom-parsing.md
@@ -283,7 +283,7 @@ While DocETL provides several built-in parsing tools, the community can always b
 If the built-in tools don't meet your needs, you can create your own custom parsing tools. Here's how:

 1. Define your parsing function in the `parsing_tools` section of your configuration.
-2. Ensure your function takes a document (dict) as input and returns a list of documents (dicts).
+2. Ensure your function takes an item (dict) as input and returns a list of items (dicts).
 3. Use your custom parser in the `parsing` section of your dataset configuration.

 For example:

@@ -292,7 +292,7 @@ For example:
 parsing_tools:
   - name: my_custom_parser
     function_code: |
-      def my_custom_parser(document: Dict) -> List[Dict]:
+      def my_custom_parser(item: Dict) -> List[Dict]:
           # Your custom parsing logic here
           return [processed_data]

diff --git a/docs/examples/mining-product-reviews.md b/docs/examples/mining-product-reviews.md
index 9e120d85..c6edf3ec 100644
--- a/docs/examples/mining-product-reviews.md
+++ b/docs/examples/mining-product-reviews.md
@@ -16,7 +16,7 @@ Our goal is to create a pipeline that will:
 2. Resolve similar themes across different games
 3. Generate reports of polarizing themes common across games, supported by quotes from different game reviews

-We'll be using a subset of the [STEAM review dataset](https://www.kaggle.com/datasets/andrewmvd/steam-reviews). We've created a subset that contains reviews for 500 of the most popular games, with approximately 400 reviews per game, balanced between positive and negative ratings. For each game, we concatenate all reviews into a single text for analysis---so we'll have 500 input documents, each representing a game.
You can get the dataset sample [here](https://drive.google.com/file/d/1hroljsvn8m23iVsNpET8Ma7sfb1OUu_u/view?usp=drive_link). +We'll be using a subset of the [STEAM review dataset](https://www.kaggle.com/datasets/andrewmvd/steam-reviews). We've created a subset that contains reviews for 500 of the most popular games, with approximately 400 reviews per game, balanced between positive and negative ratings. For each game, we concatenate all reviews into a single text for analysis---so we'll have 500 input items/reviews, each representing a game. You can get the dataset sample [here](https://drive.google.com/file/d/1hroljsvn8m23iVsNpET8Ma7sfb1OUu_u/view?usp=drive_link). ## Pipeline Structure @@ -284,7 +284,7 @@ This command, with `optimize: true` set for the map and resolve operations, prov 2. Blocking statements and thresholds for the resolve operation: This optimizes the theme resolution process, making it more efficient when dealing with a large number of themes across multiple games. The optimizer provided us with blocking keys of `summary` and `theme`, and a threshold of 0.596 for similarity (to get 95% recall of duplicates). -These optimizations are crucial for handling the scale of our dataset, which includes 500 games with an _average_ of 66,000 tokens per game, and 12% of the documents exceeding the context length limits of the OpenAI LLMs (128k tokens). +These optimizations are crucial for handling the scale of our dataset, which includes 500 games with an _average_ of 66,000 tokens per game, and 12% of the items/reviews exceeding the context length limits of the OpenAI LLMs (128k tokens). ??? info "Optimized Pipeline" From bcd4e8cb9f67aa7f5861be69a459b68385e347c9 Mon Sep 17 00:00:00 2001 From: Anurag Date: Mon, 14 Oct 2024 00:28:21 +0530 Subject: [PATCH 2/4] refactor: code changes documents -> items --- docetl/operations/clustering_utils.py | 12 ++++++------ docetl/operations/reduce.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docetl/operations/clustering_utils.py b/docetl/operations/clustering_utils.py index 7663b892..05fed867 100644 --- a/docetl/operations/clustering_utils.py +++ b/docetl/operations/clustering_utils.py @@ -60,8 +60,8 @@ def get_embeddings_for_clustering_with_st( return embeddings, 0 -def cluster_documents( - documents: List[Dict], +def cluster_items( + items: List[Dict], sampling_config: Dict, sample_size: int, api_wrapper: APIWrapper, @@ -70,7 +70,7 @@ def cluster_documents( Cluster documents using KMeans clustering algorithm. Args: - documents (List[Dict]): The list of documents to cluster. + items (List[Dict]): The list of documents to cluster. sampling_config (Dict): The sampling configuration. Must contain embedding_model. If embedding_keys is not specified, it will use all keys in the document. If embedding_model is not specified, it will use text-embedding-3-small. If embedding_model is sentence-transformer, it will use all-MiniLM-L6-v2. sample_size (int): The number of clusters to create. api_wrapper (APIWrapper): The API wrapper to use for embedding. @@ -78,17 +78,17 @@ def cluster_documents( Dict[int, List[Dict]]: A dictionary of clusters, where each cluster is a list of documents. 
""" embeddings, cost = get_embeddings_for_clustering( - documents, sampling_config, api_wrapper + items, sampling_config, api_wrapper ) from sklearn.cluster import KMeans - num_clusters = min(sample_size, len(documents)) + num_clusters = min(sample_size, len(items)) kmeans = KMeans(n_clusters=num_clusters, random_state=42) cluster_labels = kmeans.fit_predict(embeddings) clusters = {i: [] for i in range(num_clusters)} for idx, label in enumerate(cluster_labels): - clusters[label].append(documents[idx]) + clusters[label].append(items[idx]) return clusters, cost diff --git a/docetl/operations/reduce.py b/docetl/operations/reduce.py index cd3cee78..5d7f5012 100644 --- a/docetl/operations/reduce.py +++ b/docetl/operations/reduce.py @@ -20,7 +20,7 @@ from docetl.operations.base import BaseOperation from docetl.operations.clustering_utils import ( - cluster_documents, + cluster_items, get_embeddings_for_clustering, ) from docetl.operations.utils import rich_as_completed @@ -428,7 +428,7 @@ def _cluster_based_sampling( if sample_size >= len(group_list): return group_list, 0 - clusters, cost = cluster_documents( + clusters, cost = cluster_items( group_list, value_sampling, sample_size, self.runner.api ) From 283f79b59f37e75ecf894e9d9bc617e34345f285 Mon Sep 17 00:00:00 2001 From: Anurag Date: Mon, 14 Oct 2024 13:23:26 +0530 Subject: [PATCH 3/4] refactor: operators.md documents -> items --- docs/concepts/operators.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concepts/operators.md b/docs/concepts/operators.md index fc9070ef..617e4499 100644 --- a/docs/concepts/operators.md +++ b/docs/concepts/operators.md @@ -4,7 +4,7 @@ Operators in DocETL are designed for semantically processing unstructured data. ## Overview -- Datasets contain documents, where a document is an object in the JSON list, with fields and values. +- Datasets contain items, where a item is an object in the JSON list, with fields and values. An item here could be simple text chunk or a document reference. - DocETL provides several operators, each tailored for specific unstructured data processing tasks. - By default, operations are parallelized on your data using multithreading for improved performance. From 6e0f2bd3ea50322e02f3758cd66573d1452a192b Mon Sep 17 00:00:00 2001 From: Anurag Date: Mon, 14 Oct 2024 13:28:52 +0530 Subject: [PATCH 4/4] Revert "refactor: code changes documents -> items" This reverts commit bcd4e8cb9f67aa7f5861be69a459b68385e347c9. --- docetl/operations/clustering_utils.py | 12 ++++++------ docetl/operations/reduce.py | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docetl/operations/clustering_utils.py b/docetl/operations/clustering_utils.py index 05fed867..7663b892 100644 --- a/docetl/operations/clustering_utils.py +++ b/docetl/operations/clustering_utils.py @@ -60,8 +60,8 @@ def get_embeddings_for_clustering_with_st( return embeddings, 0 -def cluster_items( - items: List[Dict], +def cluster_documents( + documents: List[Dict], sampling_config: Dict, sample_size: int, api_wrapper: APIWrapper, @@ -70,7 +70,7 @@ def cluster_items( Cluster documents using KMeans clustering algorithm. Args: - items (List[Dict]): The list of documents to cluster. + documents (List[Dict]): The list of documents to cluster. sampling_config (Dict): The sampling configuration. Must contain embedding_model. If embedding_keys is not specified, it will use all keys in the document. If embedding_model is not specified, it will use text-embedding-3-small. 
If embedding_model is sentence-transformer, it will use all-MiniLM-L6-v2. sample_size (int): The number of clusters to create. api_wrapper (APIWrapper): The API wrapper to use for embedding. @@ -78,17 +78,17 @@ def cluster_items( Dict[int, List[Dict]]: A dictionary of clusters, where each cluster is a list of documents. """ embeddings, cost = get_embeddings_for_clustering( - items, sampling_config, api_wrapper + documents, sampling_config, api_wrapper ) from sklearn.cluster import KMeans - num_clusters = min(sample_size, len(items)) + num_clusters = min(sample_size, len(documents)) kmeans = KMeans(n_clusters=num_clusters, random_state=42) cluster_labels = kmeans.fit_predict(embeddings) clusters = {i: [] for i in range(num_clusters)} for idx, label in enumerate(cluster_labels): - clusters[label].append(items[idx]) + clusters[label].append(documents[idx]) return clusters, cost diff --git a/docetl/operations/reduce.py b/docetl/operations/reduce.py index 5d7f5012..cd3cee78 100644 --- a/docetl/operations/reduce.py +++ b/docetl/operations/reduce.py @@ -20,7 +20,7 @@ from docetl.operations.base import BaseOperation from docetl.operations.clustering_utils import ( - cluster_items, + cluster_documents, get_embeddings_for_clustering, ) from docetl.operations.utils import rich_as_completed @@ -428,7 +428,7 @@ def _cluster_based_sampling( if sample_size >= len(group_list): return group_list, 0 - clusters, cost = cluster_items( + clusters, cost = cluster_documents( group_list, value_sampling, sample_size, self.runner.api )
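
Editor's note: to sanity-check the parser contract documented in PATCH 1/4 (a custom parsing function takes an item (dict) as input and returns a list of items (dicts)), here is a minimal, hypothetical sketch. The `text` and `chunk_index` fields and the fixed-size character chunking are illustrative assumptions, not part of DocETL's API.

```python
from typing import Dict, List


def my_custom_parser(item: Dict) -> List[Dict]:
    """Split one input item into several smaller items.

    The only contract assumed here (from docs/examples/custom-parsing.md) is:
    take a single item (dict) and return a list of items (dicts). The "text"
    key and fixed-size chunking below are illustrative only.
    """
    text = item.get("text", "")
    chunk_size = 2000  # characters per chunk; arbitrary for this sketch
    chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)] or [""]
    # Carry the original fields onto every emitted item, overriding "text".
    return [{**item, "text": chunk, "chunk_index": i} for i, chunk in enumerate(chunks)]


if __name__ == "__main__":
    # One long item becomes three chunk items.
    parsed = my_custom_parser({"title": "demo", "text": "x" * 4500})
    print(len(parsed), [p["chunk_index"] for p in parsed])  # 3 [0, 1, 2]
```

Copying the original fields onto every emitted item is a design choice assumed here so that downstream operators can still group chunks back to their source item.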