Skip to content

Commit

Permalink
Merge branch 'main' into parallel-execution-optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
dreadatour authored Sep 17, 2024
2 parents 44b6d0e + 2c8cefc commit 04f1902
Show file tree
Hide file tree
Showing 94 changed files with 3,311 additions and 5,023 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ repos:
- id: trailing-whitespace
exclude: '^LICENSES/'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: 'v0.6.3'
rev: 'v0.6.5'
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
Expand Down
9 changes: 6 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
================
|logo| DataChain
================

|PyPI| |Python Version| |Codecov| |Tests|

.. |logo| image:: docs/assets/datachain.svg
:height: 24
.. |PyPI| image:: https://img.shields.io/pypi/v/datachain.svg
:target: https://pypi.org/project/datachain/
:alt: PyPI
Expand All @@ -13,9 +19,6 @@
:target: https://github.com/iterative/datachain/actions/workflows/tests.yml
:alt: Tests

AI 🔗 DataChain
----------------

DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
It is made to organize your unstructured data into datasets and wrangle it at scale on
your local machine. DataChain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
Expand Down
1 change: 1 addition & 0 deletions docs/assets/datachain-white.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file removed docs/assets/datachain.png
Binary file not shown.
24 changes: 24 additions & 0 deletions docs/assets/datachain.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# 🔗 DataChain Getting Started
# Get Started with DataChain

🔨Wrangle unstructured AI data at scale

Expand Down
8 changes: 7 additions & 1 deletion examples/computer_vision/iptc_exif_xmp_lib.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# pip install defusedxml
"""
To install the required dependencies:
pip install datachain[examples]
"""

import json

from PIL import (
Expand Down
8 changes: 7 additions & 1 deletion examples/computer_vision/llava2_image_desc_lib.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# pip install accelerate torch
"""
To install the required dependencies:
pip install datachain[examples]
"""

import torch
from transformers import (
AutoProcessor,
Expand Down
2 changes: 0 additions & 2 deletions examples/get_started/json-csv-reader.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# pip install datamodel-code-generator jmespath

from typing import Optional

from pydantic import BaseModel
Expand Down
7 changes: 6 additions & 1 deletion examples/get_started/torch-loader.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
# pip install Pillow torchvision
"""
To install the required dependencies:
pip install datachain[torch]
"""

import os
from posixpath import basename
Expand Down
8 changes: 6 additions & 2 deletions examples/get_started/udfs/stateful.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
"""
To install dependencies:
To install the required dependencies:
pip install open_clip_torch
pip install datachain[examples]
"""

import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import open_clip

from datachain import C, DataChain, Mapper
Expand Down
19 changes: 10 additions & 9 deletions examples/multimodal/clip_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,23 @@
from torch.utils.data import DataLoader

from datachain import C, DataChain
from datachain.sql.functions import path

source = "gs://datachain-demo/50k-laion-files/000000/00000000*"


def create_dataset():
imgs = (
DataChain.from_storage(source, type="image")
.filter(C("file.path").glob("*.jpg"))
.map(stem=lambda file: file.get_file_stem(), params=["file"], output=str)
imgs = DataChain.from_storage(source, type="image").filter(
C("file.path").glob("*.jpg")
)
captions = (
DataChain.from_storage(source, type="text")
.filter(C("file.path").glob("*.txt"))
.map(stem=lambda file: file.get_file_stem(), params=["file"], output=str)
captions = DataChain.from_storage(source, type="text").filter(
C("file.path").glob("*.txt")
)
return imgs.merge(
captions,
on=path.file_stem(imgs.c("file.path")),
right_on=path.file_stem(captions.c("file.path")),
)
return imgs.merge(captions, on="stem")


if __name__ == "__main__":
Expand Down
23 changes: 11 additions & 12 deletions examples/multimodal/wds.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os

from datachain import C, DataChain
from datachain import DataChain
from datachain.lib.webdataset import process_webdataset
from datachain.lib.webdataset_laion import WDSLaion, process_laion_meta
from datachain.sql.functions import path
Expand All @@ -16,7 +16,7 @@
)

wds_images = (
DataChain.from_storage(IMAGE_TARS)
DataChain.from_storage(IMAGE_TARS, type="image")
.settings(cache=True)
.gen(laion=process_webdataset(spec=WDSLaion), params="file")
)
Expand All @@ -25,21 +25,20 @@
DataChain.from_parquet(PARQUET_METADATA)
.settings(cache=True)
.merge(wds_images, on="uid", right_on="laion.json.uid", inner=True)
.mutate(stem=path.file_stem(C("source.file.path")))
)

res = (
wds_npz = (
DataChain.from_storage(NPZ_METADATA)
.settings(cache=True)
.gen(emd=process_laion_meta)
.mutate(stem=path.file_stem(C("emd.file.path")))
.merge(
wds_with_pq,
on=["stem", "emd.index"],
right_on=["stem", "source.index"],
inner=True,
)
.save("wds")
)


res = wds_npz.merge(
wds_with_pq,
on=[path.file_stem(wds_npz.c("emd.file.path")), "emd.index"],
right_on=[path.file_stem(wds_with_pq.c("source.file.path")), "source.index"],
inner=True,
).save("wds")

res.show(5)
8 changes: 4 additions & 4 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
site_name: DataChain
site_url: https://datachain.dvc.ai
site_name: ''
site_url: https://docs.datachain.ai
site_description: Wrangle unstructured AI data at scale

repo_url: "https://github.com/iterative/datachain"
Expand All @@ -15,8 +15,8 @@ validation:

theme:
name: material
logo: assets/datachain.png
favicon: assets/datachain.png
logo: assets/datachain-white.svg
favicon: assets/datachain.svg
icon:
repo: fontawesome/brands/github
features:
Expand Down
1 change: 1 addition & 0 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def tests(session: nox.Session) -> None:
"--cov-report=xml",
"--durations=10",
"--numprocesses=logical",
"--dist=loadgroup",
*session.posargs,
env={"COVERAGE_FILE": f".coverage.{session.python}"},
)
Expand Down
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ vector = [
]
hf = [
"numba>=0.60.0",
"datasets[audio,vision]"
"datasets[audio,vision]>=2.21.0"
]
tests = [
"datachain[torch,remote,vector,hf]",
Expand All @@ -82,7 +82,6 @@ tests = [
"pytest-mock>=3.12.0",
"pytest-servers[all]>=0.5.5",
"pytest-benchmark[histogram]",
"pytest-asyncio>=0.23.2",
"pytest-xdist>=3.3.1",
"virtualenv",
"dulwich",
Expand Down Expand Up @@ -136,13 +135,16 @@ markers = [
"llm_and_nlp: LLM and NLP examples",
"multimodal: Multimodal examples"
]
asyncio_mode = "auto"
filterwarnings = [
"error::pandas.errors.PerformanceWarning",
"error::pydantic.warnings.PydanticDeprecatedSince20",
"error::pytest_mock.PytestMockWarning",
"error::pytest.PytestCollectionWarning",
"error::sqlalchemy.exc.SADeprecationWarning",
"ignore::DeprecationWarning:timm.*",
"ignore::DeprecationWarning:botocore.auth",
"ignore::DeprecationWarning:datasets.utils._dill",
"ignore::DeprecationWarning:librosa.core.intervals",
"ignore:Field name .* shadows an attribute in parent:UserWarning" # datachain.lib.feature
]

Expand Down
13 changes: 4 additions & 9 deletions src/datachain/asyn.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,16 @@
import asyncio
from collections.abc import Awaitable, Coroutine, Iterable
from collections.abc import AsyncIterable, Awaitable, Coroutine, Iterable, Iterator
from concurrent.futures import ThreadPoolExecutor
from heapq import heappop, heappush
from typing import (
Any,
Callable,
Generic,
Optional,
TypeVar,
)
from typing import Any, Callable, Generic, Optional, TypeVar

from fsspec.asyn import get_loop

ASYNC_WORKERS = 20

InputT = TypeVar("InputT", contravariant=True) # noqa: PLC0105
ResultT = TypeVar("ResultT", covariant=True) # noqa: PLC0105
T = TypeVar("T")


class AsyncMapper(Generic[InputT, ResultT]):
Expand Down Expand Up @@ -226,7 +221,7 @@ async def _break_iteration(self) -> None:
self._push_result(self._next_yield, None)


def iter_over_async(ait, loop):
def iter_over_async(ait: AsyncIterable[T], loop) -> Iterator[T]:
"""Wrap an asynchronous iterator into a synchronous one"""
ait = ait.__aiter__()

Expand Down
1 change: 0 additions & 1 deletion src/datachain/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ class UniqueId:
etag: str
version: str = ""
is_latest: bool = True
vtype: str = ""
location: Optional[str] = None
last_modified: datetime = TIME_ZERO

Expand Down
Loading

0 comments on commit 04f1902

Please sign in to comment.