Commit

Merge branch 'master' into fix/dataProductUnset
david-leifker authored Oct 4, 2024
2 parents 6ea79e9 + 73d8a46 commit 89f9312
Showing 86 changed files with 857 additions and 287 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/metadata-io.yml
@@ -8,6 +8,7 @@ on:
- "li-utils/**"
- "metadata-models/**"
- "metadata-io/**"
- ".github/workflows/metadata-io.yml"
pull_request:
branches:
- "**"
@@ -16,6 +17,7 @@ on:
- "li-utils/**"
- "metadata-models/**"
- "metadata-io/**"
- ".github/workflows/metadata-io.yml"
release:
types: [published]

@@ -52,6 +54,8 @@ jobs:
sudo apt-get remove 'dotnet-*' azure-cli || true
sudo rm -rf /usr/local/lib/android/ || true
sudo docker image prune -a -f || true
+ - name: Disk Check
+   run: df -h . && docker images
- uses: acryldata/sane-checkout-action@v3
- name: Set up JDK 17
uses: actions/setup-java@v4
2 changes: 1 addition & 1 deletion build.gradle
@@ -391,7 +391,7 @@ subprojects {
implementation externalDependency.annotationApi
constraints {
implementation("com.google.googlejavaformat:google-java-format:$googleJavaFormatVersion")
- implementation('io.netty:netty-all:4.1.100.Final')
+ implementation('io.netty:netty-all:4.1.114.Final')
implementation('org.apache.commons:commons-compress:1.26.0')
implementation('org.apache.velocity:velocity-engine-core:2.3')
implementation('org.hibernate:hibernate-validator:6.0.20.Final')
2 changes: 1 addition & 1 deletion datahub-frontend/play.gradle
@@ -20,7 +20,7 @@ dependencies {
play('com.nimbusds:nimbus-jose-jwt:8.18')
play('com.typesafe.akka:akka-actor_2.12:2.6.20')
play(externalDependency.jsonSmart)
- play('io.netty:netty-all:4.1.86.Final')
+ play('io.netty:netty-all:4.1.114.Final')
implementation(externalDependency.commonsText) {
because("previous versions are vulnerable to CVE-2022-42889")
}
2 changes: 2 additions & 0 deletions docker/build.gradle
@@ -108,6 +108,8 @@ dockerCompose {
environment.put "ACTIONS_EXTRA_PACKAGES", 'acryl-datahub-actions[executor] acryl-datahub-actions'
environment.put "ACTIONS_CONFIG", 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml'
environment.put 'DATAHUB_TELEMETRY_ENABLED', 'false' // disabled when built locally
+ // disabled for spark-lineage smoke-test
+ environment.put 'DATAHUB_LOCAL_COMMON_ENV', "${rootProject.project(':metadata-integration:java:spark-lineage-legacy').projectDir}/spark-smoke-test/smoke-gms.env"

useComposeFiles = ['profiles/docker-compose.yml']
projectName = 'datahub'
46 changes: 43 additions & 3 deletions docs-website/adoptionStoriesIndexes.json
@@ -14,8 +14,6 @@
{
"name": "Visa",
"slug": "visa",
"imageUrl": "/img/logos/companies/visa.png",
"imageSize": "large",
"link": "https://blog.datahubproject.io/how-visa-uses-datahub-to-scale-data-governance-cace052d61c5",
"linkType": "blog",
"tagline": "How Visa uses DataHub to scale data governance",
@@ -56,6 +54,12 @@
"category": "B2B & B2C",
"description": "Pinterest adopted a DataHub project to enhance metadata management for its big data query platform, facilitating better data navigation and understanding."
},
+ {
+   "name": "Snap, Inc.",
+   "slug": "snap",
+   "imageUrl": "/img/logos/companies/snap.png",
+   "imageSize": "small"
+ },
{
"name": "Airtel",
"slug": "airtel",
@@ -89,6 +93,42 @@
"category": "B2B & B2C",
"description": "<i>“We looked around for data catalog tool, and DataHub was a clear winner.”</i> <br /> <br /> Zynga levels up data management using DataHub, highlighting its role in enhancing data management, tracing data lineage, and ensuring data quality."
},
+ {
+   "name": "Miro",
+   "slug": "miro",
+   "imageUrl": "/img/logos/companies/miro.png",
+   "imageSize": "medium",
+   "link": "https://miro.com/careers/life-at-miro/tech/data-products-reliability-the-power-of-metadata/",
+   "linkType": "blog",
+   "tagline": "Data Products Reliability: The Power of Metadata",
+   "category": "B2B & B2C",
+   "platform": "cloud",
+   "description": "<i>\"Leveraging our Datahub catalog, we have centralized metadata access for all data products. This integration eliminates the dependency on Airflow metadata alone for defining contracts, enabling flexible definitions for both building blocks and business metrics.\"</i>"
+ },
+ {
+   "name": "Foursquare",
+   "slug": "foursquare",
+   "imageUrl": "/img/logos/companies/foursquare.png",
+   "imageSize": "medium",
+   "link": "https://location.foursquare.com/resources/blog/leadership/foursquare-data-platform-from-fragmentation-to-control-plane/",
+   "linkType": "blog",
+   "tagline": "Foursquare Data Platform: From Fragmentation to Control (Plane)",
+   "category": "B2B & B2C",
+   "platform": "cloud",
+   "description": "<i>\"After evaluating several options (...) we chose DataHub as the control plane for our data platform and partnered with Acryl Data, which offers DataHub Cloud, a premium hosted version of DataHub. (...) Another important thing that worked in DataHub’s favor is the rich and flexible taxonomy it offered for modeling the various aspects of a data platform.\"</i>"
+ },
+ {
+   "name": "Deutsche Telekom",
+   "slug": "deutsche-telekom",
+   "imageUrl": "/img/logos/companies/deutsche-telekom.png",
+   "imageSize": "medium",
+   "link": "https://karanjindal95.medium.com/from-chaos-to-clarity-how-datahub-transformed-our-data-utilization-5b5151efd34a",
+   "linkType": "blog",
+   "tagline": "From Chaos to Clarity: How DataHub Transformed our Data Utilization",
+   "category": "B2B & B2C",
+   "platform": "cloud",
+   "description": "<i>\"The DataHub data catalog significantly supported our AI/ML team’s efforts by offering seamless access to detailed column descriptions and table schemas through its APIs. This comprehensive data accessibility enabled the team to efficiently develop a text-to-SQL tool, which translates natural language queries into SQL commands.\"</i>"
+ },
{
"name": "Chime",
"slug": "chime",
@@ -374,4 +414,4 @@
"category": "And More"
}
]
- }
\ No newline at end of file
+ }
@@ -16,7 +16,7 @@ const caseStudyData = [
tag: "Finance",
backgroundImage:
"https://miro.medium.com/v2/resize:fit:2000/format:webp/1*[email protected]",
image: "https://datahubproject.io/img/logos/companies/visa.png",
image: "/img/logos/companies/visa_text.png",
link: "https://datahubproject.io/adoption-stories/#visa",
},
{
2 changes: 1 addition & 1 deletion docs-website/src/pages/_components/Logos/index.js
@@ -171,7 +171,7 @@ export const CompanyLogos = () => (
className={clsx("mySwiper", styles.companyWrapper)}
>
{companies
- .filter((company) => company.imageUrl) // Filter companies with imageUrl
+ .filter((company) => company.imageUrl && company.link) // Filter companies with imageUrl and link
.map((company, idx) => (
<SwiperSlide key={idx}>
{company.link ? (
13 changes: 11 additions & 2 deletions docs-website/src/pages/index.js
@@ -21,7 +21,7 @@ import CloseButton from "@ant-design/icons/CloseCircleFilled";

const companyIndexes = require("../../adoptionStoriesIndexes.json");
const companies = companyIndexes.companies;
- const keyCompanySlugs = ["netflix", "visa", "pinterest", "airtel", "optum"];
+ const keyCompanySlugs = ["netflix", "pinterest", "notion", "snap", "optum"]; //, "airtel"];
const keyCompanies = keyCompanySlugs
.map((slug) => companies.find((co) => co.slug === slug))
.filter((isDefined) => isDefined);
@@ -63,7 +63,13 @@ function Home() {
</div>
<div className="company_logos_list_wrapper">
{keyCompanies.map((company) => (
- <a href={`/adoption-stories#${company.slug}`}>
+ <a
+   href={
+     company.slug != "snap"
+       ? `/adoption-stories#${company.slug}`
+       : undefined
+   }
+ >
<img
src={useBaseUrl(company.imageUrl)}
alt={company.name}
@@ -72,6 +78,9 @@
/>
</a>
))}
<a href="/adoption-stories" class="more_link">
+ More
</a>
</div>
{/* <div style={{ textAlign: "center", margin: "1rem" }}>
<Link
15 changes: 14 additions & 1 deletion docs-website/src/styles/global.scss
@@ -399,7 +399,7 @@ div[class^="announcementBar"] {
.text {
padding-right: 24px;
color: #777;
- font-size: 1.25rem;
+ font-size: 1.2rem;
line-height: 1.5rem;
}

@@ -409,6 +409,16 @@
align-items: center;
flex-grow: 1;
justify-content: space-around;


+ .more_link {
+   font-size: 1.25rem;
+   color: #bbb;
+   font-weight: 600;
+   text-decoration: none;
+   position: relative;
+   top: -.4rem;
+ }
}
.company_logo {
max-width: 80px;
@@ -486,6 +496,9 @@
}
.company_logos_list_wrapper {
width: 100%;
+ .more_link {
+   display: none;
+ }
}
.company_logo {
max-width: 60px;
Binary file added docs-website/static/img/logos/companies/snap.png
Binary file removed docs-website/static/img/logos/companies/visa.png
@@ -21,6 +21,7 @@
)

from looker_sdk.error import SDKError
+ from looker_sdk.rtl.serialize import DeserializeError
from looker_sdk.sdk.api40.models import (
LookmlModelExplore,
LookmlModelExploreField,
@@ -1131,7 +1132,16 @@ def from_api( # noqa: C901
logger.warning(
f"Failed to extract explore {explore_name} from model {model}: {e}"
)

+ except DeserializeError as e:
+     reporter.warning(
+         title="Failed to fetch explore from the Looker API",
+         message=(
+             "An error occurred while extracting the explore from the model. "
+             "Please check the explore and model configurations."
+         ),
+         context=f"Explore: {explore_name}, Model: {model}",
+         exc=e,
+     )
except AssertionError:
reporter.report_warning(
title="Unable to find Views",
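Note: the new except branch separates payload-deserialization failures from the SDKError path, so each failure mode gets a structured warning instead of an unhandled crash. A minimal sketch of the pattern, assuming looker_sdk is installed; the Reporter class and fetch_explore wrapper are hypothetical stand-ins for DataHub's source reporter, not the source's actual code:

```python
from looker_sdk.error import SDKError
from looker_sdk.rtl.serialize import DeserializeError


class Reporter:
    """Hypothetical stand-in for DataHub's source reporter."""

    def warning(self, title: str, message: str, context: str, exc: Exception) -> None:
        print(f"WARN [{title}] {message} ({context}): {exc}")


def fetch_explore(sdk, reporter: Reporter, model: str, explore_name: str):
    try:
        # Real Looker SDK 4.0 call: fetch one explore from one LookML model.
        return sdk.lookml_model_explore(model, explore_name)
    except SDKError as e:
        # The API call itself failed (auth, 404, transport).
        reporter.warning(
            title="Failed to fetch explore from the Looker API",
            message="The explore API call failed.",
            context=f"Explore: {explore_name}, Model: {model}",
            exc=e,
        )
    except DeserializeError as e:
        # The API answered, but the payload did not match the SDK's models.
        reporter.warning(
            title="Failed to fetch explore from the Looker API",
            message="The explore response could not be deserialized.",
            context=f"Explore: {explore_name}, Model: {model}",
            exc=e,
        )
    return None
```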
@@ -16,6 +16,7 @@
)

from looker_sdk.error import SDKError
+ from looker_sdk.rtl.serialize import DeserializeError
from looker_sdk.sdk.api40.models import (
Dashboard,
DashboardElement,
@@ -1288,12 +1289,13 @@ def process_dashboard(
dashboard_id=dashboard_id,
fields=fields,
)
- except SDKError:
+ except (SDKError, DeserializeError) as e:
# A looker dashboard could be deleted in between the list and the get
self.reporter.report_warning(
title="Error Loading Dashboard",
title="Failed to fetch dashboard from the Looker API",
message="Error occurred while attempting to loading dashboard from Looker API. Skipping.",
context=f"Dashboard ID: {dashboard_id}",
+ exc=e,
)
return [], None, dashboard_id, start_time, datetime.datetime.now()

Expand Down
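Note: the inline comment names the underlying race: a dashboard can be deleted between the list call and the follow-up get. A generic sketch of that defensive pattern, assuming a hypothetical process_dashboards wrapper; sdk.dashboard() is the real Looker SDK call, everything else is illustrative:

```python
import datetime

from looker_sdk.error import SDKError
from looker_sdk.rtl.serialize import DeserializeError


def process_dashboards(sdk, dashboard_ids):
    """Fetch each previously listed dashboard, tolerating deletions in between."""
    results = []
    for dashboard_id in dashboard_ids:
        start_time = datetime.datetime.now()
        try:
            dashboard = sdk.dashboard(dashboard_id=dashboard_id)
        except (SDKError, DeserializeError):
            # The dashboard was deleted (or returned an unparseable payload)
            # between the list and this get: skip it rather than abort the run.
            continue
        results.append((dashboard, start_time, datetime.datetime.now()))
    return results
```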
22 changes: 22 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/nifi.py
@@ -332,10 +332,14 @@ def __init__(self) -> None:
}

def process_s3_provenance_event(self, event):
logger.debug(f"Processing s3 provenance event: {event}")
attributes = event.get("attributes", [])
s3_bucket = get_attribute_value(attributes, "s3.bucket")
s3_key = get_attribute_value(attributes, "s3.key")
if not s3_key:
+ logger.debug(
+     "s3.key not present in the list of attributes, trying to use filename attribute instead"
+ )
s3_key = get_attribute_value(attributes, "filename")

s3_url = f"s3://{s3_bucket}/{s3_key}"
@@ -344,6 +348,7 @@
dataset_name = s3_path.replace("/", ".")
platform = "s3"
dataset_urn = builder.make_dataset_urn(platform, s3_path, self.env)
logger.debug(f"Reasoned s3 dataset urn: {dataset_urn}")
return ExternalDataset(
platform,
dataset_name,
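Note: a standalone sketch of the fallback these debug lines trace, assuming a hypothetical event shape and a simplified URN string in place of builder.make_dataset_urn("s3", s3_path, env):

```python
def get_attribute_value(attributes, name):
    # NiFi provenance events carry attributes as a list of name/value pairs.
    return next((a.get("value") for a in attributes if a.get("name") == name), None)


def s3_dataset_urn(event, env="PROD"):
    attributes = event.get("attributes", [])
    s3_bucket = get_attribute_value(attributes, "s3.bucket")
    s3_key = get_attribute_value(attributes, "s3.key")
    if not s3_key:
        # Some processors only set "filename"; fall back to it for the object key.
        s3_key = get_attribute_value(attributes, "filename")
    s3_path = f"{s3_bucket}/{s3_key}"
    # Simplified stand-in for DataHub's URN builder.
    return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_path},{env})"


event = {
    "attributes": [
        {"name": "s3.bucket", "value": "demo-bucket"},
        {"name": "filename", "value": "raw/orders.csv"},
    ]
}
print(s3_dataset_urn(event))
# urn:li:dataset:(urn:li:dataPlatform:s3,demo-bucket/raw/orders.csv,PROD)
```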
@@ -910,6 +915,11 @@ def construct_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
)

for component in self.nifi_flow.components.values():
+ logger.debug(
+     f"Beginning construction of workunits for component {component.id} of type {component.type} and name {component.name}"
+ )
+ logger.debug(f"Inlets of the component: {component.inlets.keys()}")
+ logger.debug(f"Outlets of the component: {component.outlets.keys()}")
job_name = component.name
job_urn = builder.make_data_job_urn_with_flow(flow_urn, component.id)

@@ -937,6 +947,9 @@ def construct_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
jobProperties["last_event_time"] = component.last_event_time

for dataset in component.inlets.values():
+ logger.debug(
+     f"Yielding dataset workunits for {dataset.dataset_urn} (inlet)"
+ )
yield from self.construct_dataset_workunits(
dataset.platform,
dataset.dataset_name,
Expand All @@ -945,6 +958,9 @@ def construct_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
)

for dataset in component.outlets.values():
+ logger.debug(
+     f"Yielding dataset workunits for {dataset.dataset_urn} (outlet)"
+ )
yield from self.construct_dataset_workunits(
dataset.platform,
dataset.dataset_name,
@@ -1207,6 +1223,7 @@ def construct_job_workunits(
inputJobs: List[str] = [],
status: Optional[str] = None,
) -> Iterable[MetadataWorkUnit]:
logger.debug(f"Begining construction of job workunit for {job_urn}")
if job_properties:
job_properties = {k: v for k, v in job_properties.items() if v is not None}

@@ -1229,8 +1246,12 @@
inlets.sort()
outlets.sort()
inputJobs.sort()
logger.debug(f"Inlets after sorting: {inlets}")
logger.debug(f"Outlets after sorting: {outlets}")
logger.debug(f"Input jobs after sorting: {inputJobs}")

if self.config.incremental_lineage:
logger.debug("Preparing mcps for incremental lineage")
patch_builder: DataJobPatchBuilder = DataJobPatchBuilder(job_urn)
for inlet in inlets:
patch_builder.add_input_dataset(inlet)
@@ -1239,6 +1260,7 @@
for inJob in inputJobs:
patch_builder.add_input_datajob(inJob)
for patch_mcp in patch_builder.build():
logger.debug(f"Preparing Patch MCP: {patch_mcp}")
yield MetadataWorkUnit(
id=f"{job_urn}-{patch_mcp.aspectName}", mcp_raw=patch_mcp
)
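Note: when incremental_lineage is enabled, the source emits JSON Patch MCPs via DataJobPatchBuilder rather than overwriting the whole dataJobInputOutput aspect. A short usage sketch of the builder as exercised above; the URNs are made up for illustration:

```python
from datahub.specific.datajob import DataJobPatchBuilder

# Hypothetical URNs for illustration only.
job_urn = "urn:li:dataJob:(urn:li:dataFlow:(nifi,demo_flow,PROD),proc-1234)"
inlet = "urn:li:dataset:(urn:li:dataPlatform:s3,demo-bucket/raw/orders.csv,PROD)"
outlet = "urn:li:dataset:(urn:li:dataPlatform:s3,demo-bucket/clean/orders.csv,PROD)"

patch_builder = DataJobPatchBuilder(job_urn)
patch_builder.add_input_dataset(inlet)
patch_builder.add_output_dataset(outlet)

for patch_mcp in patch_builder.build():
    # Each MCP patches a single aspect additively instead of overwriting it,
    # so lineage contributed by other sources is preserved.
    print(patch_mcp.aspectName)
```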
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/specific/datajob.py
@@ -330,7 +330,7 @@ def add_output_dataset(
self._add_patch(
DataJobInputOutput.ASPECT_NAME,
"add",
path=f"/outputDatasetEdges/{self.quote(str(input))}",
path=f"/outputDatasetEdges/{self.quote(str(output))}",
value=output_edge,
)
return self
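Note: the one-line fix above corrects a copy-paste slip from add_input_dataset: the JSON Patch path for output edges was built from input, which is not a parameter of add_output_dataset, so str(input) stringified Python's input builtin and produced a nonsense path segment. A reduced illustration, with urllib's quote assumed as a stand-in for the builder's own escaping helper:

```python
# Reduced illustration of the bug and the fix; quote() is an assumption,
# not DataHub's actual escaping code.
from urllib.parse import quote

output = "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)"

# Before: str(input) picks up the builtin, not a URN.
buggy_path = f"/outputDatasetEdges/{quote(str(input), safe='')}"
# After: the path is built from the output URN actually being added.
fixed_path = f"/outputDatasetEdges/{quote(str(output), safe='')}"

print(buggy_path)  # /outputDatasetEdges/%3Cbuilt-in%20function%20input%3E
print(fixed_path)
```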

0 comments on commit 89f9312
