Skip to content

Commit

Permalink
add submodule URL candidates by URL-rewrite to match parent protocol
Browse files Browse the repository at this point in the history
  • Loading branch information
bpinsard committed May 28, 2024
1 parent 864dc4a commit b8759bc
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 47 deletions.
80 changes: 42 additions & 38 deletions datalad/distribution/get.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,74 +11,72 @@
"""

import logging
import re

import os.path as op
import re

from datalad.config import ConfigManager
from datalad.core.distributed.clone import clone_dataset
from datalad.distribution.dataset import (
Dataset,
EnsureDataset,
datasetmethod,
require_dataset,
)
from datalad.distribution.utils import (
_get_flexible_source_candidates,
rewrite_match_scheme,
)
from datalad.interface.base import (
Interface,
build_doc,
eval_results,
)
from datalad.interface.base import build_doc
from datalad.interface.common_opts import (
jobs_opt,
location_description,
reckless_opt,
recursion_flag,
)
from datalad.interface.results import (
annexjson2result,
get_status_dict,
is_ok_dataset,
results_from_annex_noinfo,
results_from_paths,
annexjson2result,
success_status_map,
results_from_annex_noinfo,
)
from datalad.interface.common_opts import (
recursion_flag,
location_description,
jobs_opt,
reckless_opt,
)
from datalad.interface.results import is_ok_dataset
from datalad.local.subdatasets import Subdatasets
from datalad.support.annexrepo import AnnexRepo
from datalad.support.collections import ReadOnlyDict
from datalad.support.constraints import (
EnsureInt,
EnsureChoice,
EnsureStr,
EnsureInt,
EnsureNone,
)
from datalad.support.collections import ReadOnlyDict
from datalad.support.param import Parameter
from datalad.support.annexrepo import AnnexRepo
from datalad.support.gitrepo import (
GitRepo,
_fixup_submodule_dotgit_setup,
EnsureStr,
)
from datalad.support.exceptions import (
CapturedException,
CommandError,
InsufficientArgumentsError,
)
from datalad.support.gitrepo import (
GitRepo,
_fixup_submodule_dotgit_setup,
)
from datalad.support.network import (
URL,
RI,
URL,
urlquote,
)
from datalad.support.parallel import (
ProducerConsumerProgressLog,
)
from datalad.support.parallel import ProducerConsumerProgressLog
from datalad.support.param import Parameter
from datalad.utils import (
unique,
Path,
get_dataset_root,
shortened_repr,
unique,
)

from datalad.local.subdatasets import Subdatasets

from datalad.distribution.dataset import (
Dataset,
EnsureDataset,
datasetmethod,
require_dataset,
)
from datalad.core.distributed.clone import clone_dataset
from datalad.distribution.utils import _get_flexible_source_candidates

__docformat__ = 'restructuredtext'

lgr = logging.getLogger('datalad.distribution.get')
Expand Down Expand Up @@ -115,6 +113,9 @@ def _get_flexible_source_candidates_for_submodule(ds, sm):
- A URL or absolute path recorded for git in `.gitmodules` (cost 600).
- In case the URL scheme of the parent dataset does not match that of the
submodule URL, attempt a rewrite to match the parent's scheme (cost 610).
- URL of any configured superdataset remote that is known to have the
desired submodule commit, with the submodule path appended to it.
There can be more than one candidate (cost 650).
Expand Down Expand Up @@ -234,6 +235,9 @@ def _get_flexible_source_candidates_for_submodule(ds, sm):
remote_url,
alternate_suffix=False)
)
# Also offer the submodule URL re-expressed in the protocol used by the
# parent dataset's remote. The original code referenced an undefined
# name (`rew_url`) here, which raised NameError whenever a rewrite was
# produced.
rewritten_url = rewrite_match_scheme(remote_url, sm_url)
if rewritten_url:
    clone_urls.append(dict(cost=610, name=remote, url=rewritten_url))

cost_candidate_expr = re.compile('[0-9][0-9][0-9].*')
candcfg_prefix = 'datalad.get.subdataset-source-candidate-'
Expand Down Expand Up @@ -881,7 +885,7 @@ def __call__(
reckless=None,
jobs='auto',
):

if not (dataset or path):
raise InsufficientArgumentsError(
"Neither dataset nor target path(s) provided")
Expand Down
12 changes: 11 additions & 1 deletion datalad/distribution/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
import os
from os.path import join as opj

from datalad.distribution.utils import _get_flexible_source_candidates
from datalad.distribution.utils import (
_get_flexible_source_candidates,
rewrite_match_scheme,
)
from datalad.support.gitrepo import GitRepo
from datalad.tests.utils_pytest import (
assert_raises,
Expand All @@ -26,6 +29,13 @@
)


def test_rewrite_match_scheme():
    """Check that a URL gets re-expressed in the protocol of the target URL."""
    # Two spellings of the same repository, one per protocol
    ssh_url = "git@domain.com:user/repo.git"
    https_url = "https://domain.com/user/repo.git"
    # First argument provides the protocol, second one the URL to rewrite
    assert rewrite_match_scheme(ssh_url, https_url) == ssh_url
    assert rewrite_match_scheme(https_url, ssh_url) == https_url
    # Nothing to rewrite when both URLs already share a protocol
    assert rewrite_match_scheme(https_url, https_url) is None


@known_failure_windows
def test_get_flexible_source_candidates():
f = _get_flexible_source_candidates
Expand Down
25 changes: 17 additions & 8 deletions datalad/distribution/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,34 @@
"""

import logging

from os.path import (
isabs,
join as opj,
normpath,
)
import posixpath
from os.path import isabs
from os.path import join as opj
from os.path import normpath

from giturlparse import parse as parse_git_url

from datalad.log import log_progress
from datalad.support.annexrepo import AnnexRepo
from datalad.support.network import (
PathRI,
RI,
URL,
PathRI,
)


lgr = logging.getLogger('datalad.distribution.utils')


def rewrite_match_scheme(target_url, src_url):
    """Re-express `src_url` using the protocol of `target_url`.

    Both URLs are parsed with ``giturlparse``. If both parse results carry
    a protocol and the two protocols differ, the form of `src_url` that
    corresponds to the protocol of `target_url` is returned.

    Parameters
    ----------
    target_url : str
      URL whose protocol should be adopted.
    src_url : str
      URL to be rewritten.

    Returns
    -------
    str or None
      The rewritten `src_url`, or None if either URL has no protocol after
      parsing, if both URLs already use the same protocol, or if no form of
      `src_url` is available for the protocol of `target_url`.
    """
    target = parse_git_url(target_url)
    source = parse_git_url(src_url)

    # The parse results may lack a protocol (presumably when the URL is not
    # recognized -- TODO confirm against giturlparse); nothing to do then
    target_protocol = getattr(target, "protocol", None)
    source_protocol = getattr(source, "protocol", None)
    if target_protocol is None or source_protocol is None:
        return None

    if target_protocol == source_protocol:
        # already matching, no rewrite needed
        return None

    # .get() instead of indexing: report "no candidate" rather than raising
    # KeyError when there is no entry for the target protocol
    return source.urls.get(target_protocol)


def _get_flexible_source_candidates(src, base_url=None, alternate_suffix=True):
"""Get candidates to try cloning from.
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
'typing_extensions>=4.0.0; python_version < "3.11"',
'annexremote',
'looseversion',
"giturlparse",
],
'downloaders': [
'boto',
Expand Down

0 comments on commit b8759bc

Please sign in to comment.