Skip to content

Commit

Permalink
fix(ingest): use branch info when cloning git repos (#6937)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored Jan 5, 2023
1 parent 9bb1c15 commit 8b1dc4b
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 25 deletions.
14 changes: 13 additions & 1 deletion metadata-ingestion/src/datahub/configuration/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class GitHubReference(ConfigModel):
)
branch: str = Field(
"main",
description="Branch on which your files live by default. Typically main or master.",
description="Branch on which your files live by default. Typically main or master. This can also be a commit hash.",
)
base_url: str = Field(
"https://github.com",
Expand Down Expand Up @@ -73,3 +73,15 @@ def auto_infer_from_repo(cls, v: Optional[str], values: Dict[str, Any]) -> str:
if v is None:
return f"[email protected]:{values.get('repo')}"
return v

@property
def branch_for_clone(self) -> Optional[str]:
# If branch was manually set, we should use it. Otherwise return None.
# We do this because we want to use the default branch unless they override it.
# While our default for branch is "main", they could be using "master" or something else.
# It's ok if the URLs we generate are slightly incorrect, but changing branch to be
# required would be a breaking change.

if "branch" in self.__fields_set__:
return self.branch
return None
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ def __init__(self, tmp_dir: str, skip_known_host_verification: bool = True):
self.skip_known_host_verification = skip_known_host_verification
self.last_repo_cloned: Optional[git.Repo] = None

def clone(self, ssh_key: Optional[SecretStr], repo_url: str) -> Path:
def clone(
self, ssh_key: Optional[SecretStr], repo_url: str, branch: Optional[str] = None
) -> Path:
unique_dir = str(uuid4())
keys_dir = f"{self.tmp_dir}/{unique_dir}/keys"
checkout_dir = f"{self.tmp_dir}/{unique_dir}/checkout"
Expand Down Expand Up @@ -58,6 +60,11 @@ def clone(self, ssh_key: Optional[SecretStr], repo_url: str) -> Path:
env=dict(GIT_SSH_COMMAND=git_ssh_cmd),
)
logger.info("✅ Cloning complete!")

if branch is not None:
logger.info(f"Checking out branch {branch}")
self.last_repo_cloned.git.checkout(branch)

return pathlib.Path(checkout_dir)

def get_last_repo_cloned(self) -> Optional[git.Repo]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1434,6 +1434,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
checkout_dir = git_clone.clone(
ssh_key=self.source_config.github_info.deploy_key,
repo_url=self.source_config.github_info.repo_ssh_locator,
branch=self.source_config.github_info.branch_for_clone,
)
self.reporter.git_clone_latency = datetime.now() - start_time
self.source_config.base_folder = checkout_dir.resolve()
Expand Down Expand Up @@ -1463,6 +1464,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
)
),
repo_url=p_ref.repo_ssh_locator,
branch=p_ref.branch_for_clone,
)

p_ref = p_checkout_dir.resolve()
Expand Down
65 changes: 42 additions & 23 deletions metadata-ingestion/tests/integration/git/test_git_clone.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,50 @@
import os

import pytest
from pydantic import SecretStr

from datahub.configuration.github import GitHubInfo
from datahub.ingestion.source.git.git_import import GitClone

LOOKML_TEST_SSH_KEY = os.environ.get("DATAHUB_LOOKML_GIT_TEST_SSH_KEY")

def test_git_clone(pytestconfig, tmp_path):

@pytest.mark.skipif(
LOOKML_TEST_SSH_KEY is None,
reason="DATAHUB_LOOKML_GIT_TEST_SSH_KEY env variable is not configured",
)
def test_git_clone(tmp_path):
git_clone = GitClone(str(tmp_path))
secret_env_variable = "DATAHUB_LOOKML_GIT_TEST_SSH_KEY"
if os.environ.get(secret_env_variable) is not None:
secret_key = SecretStr(os.environ.get(secret_env_variable)) # type: ignore
checkout_dir = git_clone.clone(
ssh_key=secret_key,
repo_url="[email protected]:acryldata/long-tail-companions-looker",
)
assert os.path.exists(checkout_dir)
assert set(os.listdir(checkout_dir)) == set(
[
".datahub",
"models",
"README.md",
".github",
".git",
"views",
]
)
else:
print(
"Skipping test as env variable DATAHUB_LOOKML_GIT_TEST_SSH_KEY is not configured"
)
secret_key = SecretStr(LOOKML_TEST_SSH_KEY) if LOOKML_TEST_SSH_KEY else None

checkout_dir = git_clone.clone(
ssh_key=secret_key,
repo_url="[email protected]:acryldata/long-tail-companions-looker",
branch="d380a2b777ec6f4653626f39c68dba85893faa74",
)
assert os.path.exists(checkout_dir)
assert set(os.listdir(checkout_dir)) == set(
[
".datahub",
"models",
"README.md",
".github",
".git",
"views",
"manifest_lock.lkml",
"manifest.lkml",
]
)


def test_github_branch():
config = GitHubInfo(
repo="owner/repo",
)
assert config.branch_for_clone is None

config = GitHubInfo(
repo="owner/repo",
branch="main",
)
assert config.branch_for_clone == "main"

0 comments on commit 8b1dc4b

Please sign in to comment.