Skip to content

Commit

Permalink
Merge pull request #327 from DagsHub/create-datasource-url-fix
Browse files (browse the repository at this point in the history)
Fix URL checking of create_datasource
  • Loading branch information
kbolashev authored Jul 23, 2023
2 parents b6fa8f8 + ca89558 commit 487cbb6
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 20 deletions.
24 changes: 5 additions & 19 deletions dagshub/data_engine/datasources.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
import logging
import urllib.parse
from typing import Optional, Union, List

from dagshub.common.analytics import send_analytics_event
from dagshub.common.api.repo import RepoAPI
from dagshub.data_engine.client.data_client import DataClient
from dagshub.data_engine.client.models import DatasourceType
from dagshub.data_engine.model.datasource import Datasource
from dagshub.data_engine.model.datasource_state import DatasourceState
from dagshub.data_engine.model.datasource_state import DatasourceState, DatasourceType, path_regexes
from dagshub.data_engine.model.errors import DatasourceNotFoundError

logger = logging.getLogger(__name__)
Expand All @@ -28,24 +26,12 @@ def create_datasource(repo: str, name: str, path: str, revision: Optional[str] =
:return The created datasource
"""

parsed = urllib.parse.urlparse(path)

if parsed.scheme == "" \
and parsed.hostname == "" \
and parsed.query == "" \
and parsed.params == "" \
and parsed.fragment == "":
return create_from_repo(repo, name, path=parsed.path, revision=revision)

elif parsed.scheme != "" and parsed.hostname != "":
# Bucket URL
if parsed.query != "" or parsed.params != "" or parsed.fragment != "":
raise ValueError("Invalid bucket URL: ", path)
if revision != "":
if path_regexes[DatasourceType.BUCKET].fullmatch(path):
if revision is not None:
raise ValueError("revision cannot be used together with bucket URLs")
return create_from_bucket(repo, name, bucket_url=path)

raise ValueError("Invalid path used, format could not be determined: ", path)
else:
return create_from_repo(repo, name, path=path, revision=revision)


create = create_datasource
Expand Down
2 changes: 1 addition & 1 deletion dagshub/data_engine/model/datasource_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
logger = logging.getLogger(__name__)

# Compiled patterns used to recognise datasource path strings, keyed by DatasourceType.
# Each pattern exposes named groups for the parts of the path it matches.
# NOTE(review): this listing looks like a rendered diff with the removed and the
# added line shown together, which is why the BUCKET key appears twice. In a
# Python dict literal the later entry for a repeated key is the one that is kept.
path_regexes = {
# Earlier BUCKET pattern: "s3://" or "gs://", a bucket name made only of word
# characters and "-", then an optional prefix starting with "/".
DatasourceType.BUCKET: re.compile(r"(?P<schema>s3|gs)://(?P<bucket>[\w\-]+)(?P<prefix>/.*)?"),
# Replacement BUCKET pattern: same shape, but the bucket name may also contain "."
# (the added "_" is already covered by \w).
DatasourceType.BUCKET: re.compile(r"(?P<schema>s3|gs)://(?P<bucket>[\w\-._]+)(?P<prefix>/.*)?"),
# REPOSITORY pattern: "repo://<user>/<repo>" followed by an optional prefix starting with "/".
DatasourceType.REPOSITORY: re.compile(r"repo://(?P<user>[\w\-_.]+)/(?P<repo>[\w\-_.]+)(?P<prefix>/.*)?"),
}

Expand Down
1 change: 1 addition & 0 deletions tests/data_engine/test_datasource_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ def test_repo_regex_incorrect(in_str):
("s3://bucket", "s3", "bucket", None),
("gs://bucket/prefix", "gs", "bucket", "/prefix"),
("s3://bucket/longer/prefix", "s3", "bucket", "/longer/prefix"),
("s3://bucket_with.weird-chars/longer/prefix", "s3", "bucket_with.weird-chars", "/longer/prefix"),
]
)
def test_bucket_regex(in_str, schema, bucket, prefix):
Expand Down

0 comments on commit 487cbb6

Please sign in to comment.