-
Notifications
You must be signed in to change notification settings - Fork 318
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
upstream run level lineage implementation #2658
Changes from 5 commits
b4944d7
2007c82
059c7da
400644a
a118c06
50a8ed5
61da280
870557f
5f99805
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,15 +5,22 @@ | |
|
||
package marquez.db; | ||
|
||
import java.time.Instant; | ||
import java.util.Collection; | ||
import java.util.List; | ||
import java.util.Optional; | ||
import java.util.Set; | ||
import java.util.UUID; | ||
import javax.validation.constraints.NotNull; | ||
import marquez.common.models.DatasetName; | ||
import marquez.common.models.JobName; | ||
import marquez.common.models.NamespaceName; | ||
import marquez.common.models.RunId; | ||
import marquez.db.mappers.DatasetDataMapper; | ||
import marquez.db.mappers.JobDataMapper; | ||
import marquez.db.mappers.JobRowMapper; | ||
import marquez.db.mappers.RunMapper; | ||
import marquez.db.mappers.UpstreamRunRowMapper; | ||
import marquez.service.models.DatasetData; | ||
import marquez.service.models.JobData; | ||
import marquez.service.models.Run; | ||
|
@@ -25,8 +32,18 @@ | |
@RegisterRowMapper(JobDataMapper.class) | ||
@RegisterRowMapper(RunMapper.class) | ||
@RegisterRowMapper(JobRowMapper.class) | ||
@RegisterRowMapper(UpstreamRunRowMapper.class) | ||
public interface LineageDao { | ||
|
||
  /** Identifying summary of a job: namespace, name, and job version (may be null). */
  public record JobSummary(NamespaceName namespace, JobName name, UUID version) {}
|
||
  /** Summary of a run: its id, start/end timestamps, and current run state. */
  public record RunSummary(RunId id, Instant start, Instant end, String status) {}
|
||
  /** Summary of a dataset version consumed by a run, and the run that produced that version. */
  public record DatasetSummary(
      NamespaceName namespace, DatasetName name, UUID version, RunId producedByRunId) {}
|
||
  /** One row of the upstream-lineage query: a run, its job, and one input dataset (may be null). */
  public record UpstreamRunRow(JobSummary job, RunSummary run, DatasetSummary input) {}
|
||
/** | ||
* Fetch all of the jobs that consume or produce the datasets that are consumed or produced by the | ||
* input jobIds. This returns a single layer from the BFS using datasets as edges. Jobs that have | ||
|
@@ -154,4 +171,51 @@ SELECT DISTINCT on(r.job_name, r.namespace_name) r.*, jv.version as job_version | |
WHERE j.uuid in (<jobUuid>) OR j.symlink_target_uuid IN (<jobUuid>) | ||
ORDER BY r.job_name, r.namespace_name, created_at DESC""") | ||
List<Run> getCurrentRuns(@BindList Collection<UUID> jobUuid); | ||
|
||
@SqlQuery( | ||
""" | ||
WITH RECURSIVE | ||
upstream_runs( | ||
r_uuid, -- run uuid | ||
dataset_uuid, dataset_version_uuid, dataset_namespace, dataset_name, -- input dataset version to the run | ||
u_r_uuid, -- upstream run that produced that dataset version | ||
depth -- current depth of traversal | ||
) AS ( | ||
|
||
-- initial case: find the inputs of the initial runs | ||
select r.uuid, | ||
dv.dataset_uuid, dv."version", dv.namespace_name, dv.dataset_name, | ||
dv.run_uuid, | ||
0 AS depth -- starts at 0 | ||
FROM (SELECT :runId::uuid AS uuid) r -- initial run | ||
LEFT JOIN runs_input_mapping rim ON rim.run_uuid = r.uuid | ||
LEFT JOIN dataset_versions dv ON dv.uuid = rim.dataset_version_uuid | ||
|
||
UNION | ||
|
||
-- recursion: find the inputs of the inputs found on the previous iteration and increase depth to know when to stop | ||
SELECT | ||
ur.u_r_uuid, | ||
dv2.dataset_uuid, dv2."version", dv2.namespace_name, dv2.dataset_name, | ||
dv2.run_uuid, | ||
ur.depth + 1 AS depth -- increase depth to check end condition | ||
FROM upstream_runs ur | ||
LEFT JOIN runs_input_mapping rim2 ON rim2.run_uuid = ur.u_r_uuid | ||
LEFT JOIN dataset_versions dv2 ON dv2.uuid = rim2.dataset_version_uuid | ||
-- end condition of the recursion: no input or depth is over the maximum set | ||
-- also avoid following cycles (ex: merge statement) | ||
WHERE ur.u_r_uuid IS NOT NULL AND ur.u_r_uuid <> ur.r_uuid AND depth < :depth | ||
pawel-big-lebowski marked this conversation as resolved.
Show resolved
Hide resolved
|
||
) | ||
|
||
-- present the result: use Distinct as we may have traversed the same edge multiple times if there are diamonds in the graph. | ||
SELECT * FROM ( -- we need the extra statement to sort after the DISTINCT | ||
SELECT DISTINCT ON (upstream_runs.r_uuid, upstream_runs.dataset_version_uuid, upstream_runs.u_r_uuid) | ||
upstream_runs.*, | ||
r.started_at, r.ended_at, r.current_run_state as state, | ||
r.job_uuid, r.job_version_uuid, r.namespace_name as job_namespace, r.job_name | ||
FROM upstream_runs, runs r WHERE upstream_runs.r_uuid = r.uuid | ||
) sub | ||
ORDER BY depth ASC; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it make sense to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good idea There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
""") | ||
List<UpstreamRunRow> getUpstreamRuns(@NotNull UUID runId, int depth); | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
/* | ||
* Copyright 2018-2023 contributors to the Marquez project | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package marquez.db.mappers; | ||
|
||
import static marquez.db.Columns.stringOrNull; | ||
import static marquez.db.Columns.stringOrThrow; | ||
import static marquez.db.Columns.timestampOrNull; | ||
import static marquez.db.Columns.uuidOrThrow; | ||
|
||
import java.sql.ResultSet; | ||
import java.sql.SQLException; | ||
import java.util.Optional; | ||
import java.util.UUID; | ||
import lombok.NonNull; | ||
import marquez.common.models.DatasetName; | ||
import marquez.common.models.JobName; | ||
import marquez.common.models.NamespaceName; | ||
import marquez.common.models.RunId; | ||
import marquez.db.Columns; | ||
import marquez.db.LineageDao.DatasetSummary; | ||
import marquez.db.LineageDao.JobSummary; | ||
import marquez.db.LineageDao.RunSummary; | ||
import marquez.db.LineageDao.UpstreamRunRow; | ||
import org.jdbi.v3.core.mapper.RowMapper; | ||
import org.jdbi.v3.core.statement.StatementContext; | ||
|
||
/** Maps the upstream query result set to a UpstreamRunRow */ | ||
public final class UpstreamRunRowMapper implements RowMapper<UpstreamRunRow> { | ||
@Override | ||
public UpstreamRunRow map(@NonNull ResultSet results, @NonNull StatementContext context) | ||
throws SQLException { | ||
return new UpstreamRunRow( | ||
new JobSummary( | ||
new NamespaceName(stringOrThrow(results, "job_namespace")), | ||
new JobName(stringOrThrow(results, "job_name")), | ||
Optional.ofNullable(stringOrNull(results, "job_version_uuid")) | ||
.map(UUID::fromString) | ||
.orElse(null)), | ||
new RunSummary( | ||
new RunId(uuidOrThrow(results, "r_uuid")), | ||
timestampOrNull(results, Columns.STARTED_AT), | ||
timestampOrNull(results, Columns.ENDED_AT), | ||
stringOrThrow(results, Columns.STATE)), | ||
results.getObject("dataset_name") == null | ||
? null | ||
: new DatasetSummary( | ||
new NamespaceName(stringOrThrow(results, "dataset_namespace")), | ||
new DatasetName(stringOrThrow(results, "dataset_name")), | ||
UUID.fromString(stringOrThrow(results, "dataset_version_uuid")), | ||
new RunId(UUID.fromString(stringOrThrow(results, "u_r_uuid"))))); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please mind documenting this in
openapi.spec
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done