upstream run level lineage implementation #2658

julienledem · 2023-10-24T21:58:14Z

Problem

When trouble shooting an issue and doing root cause analysis, it is usefull to get the upstream run-level lineage to know exactly what version of each job and dataset a run is depending on.

Solution

return the upstream run level lineage.

Example:
http://localhost:3000/api/v1/runlineage/upstream?runId=adc8507c-595e-4d76-9dac-be2bf0ffe1ee

{
    "runs": [
        {
            "job": {
                "namespace": "food_delivery",
                "name": "orders_popular_day_of_week",
                "version": "a8f00e6e-88b0-4e5b-a0d8-e7edd3048292"
            },
            "run": {
                "id": "adc8507c-595e-4d76-9dac-be2bf0ffe1ee",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:46:12Z",
                "status": "COMPLETED"
            },
            "inputs": [
                {
                    "namespace": "food_delivery",
                    "name": "public.top_delivery_times",
                    "version": "18951d81-7201-38ca-a207-3f65c7552373",
                    "producedByRunId": "bd41a42a-bf18-4b74-9bb7-cd62637823d8"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.customers",
                    "version": "68c1e307-f6bb-36f9-8596-14609c7f022b",
                    "producedByRunId": "182a9eaf-881a-4d49-860c-f7e260b8bf60"
                }
            ]
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "delivery_times_7_days",
                "version": "a3ea0de5-f86b-48a5-a5de-5e53a559f59d"
            },
            "run": {
                "id": "bd41a42a-bf18-4b74-9bb7-cd62637823d8",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:58:02Z",
                "status": "COMPLETED"
            },
            "inputs": [
                {
                    "namespace": "food_delivery",
                    "name": "public.delivery_7_days",
                    "version": "6f8f52f5-0230-31ce-a138-08b79e671b33",
                    "producedByRunId": "d5a2a4c4-fc78-428d-ae85-08c942ed8371"
                }
            ]
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_customers",
                "version": "f57c837f-8ef3-49be-98d4-14e4195518da"
            },
            "run": {
                "id": "182a9eaf-881a-4d49-860c-f7e260b8bf60",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:55Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_delivery_7_days",
                "version": "312ac05b-6022-4fce-af4c-16728b27d605"
            },
            "run": {
                "id": "d5a2a4c4-fc78-428d-ae85-08c942ed8371",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:48:12Z",
                "status": "COMPLETED"
            },
            "inputs": [
                {
                    "namespace": "food_delivery",
                    "name": "public.orders_7_days",
                    "version": "d09633c4-4412-36de-bce6-8002c662e18a",
                    "producedByRunId": "ffba2c14-4170-48da-bec3-ab5fd4ec9a3f"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.customers",
                    "version": "68c1e307-f6bb-36f9-8596-14609c7f022b",
                    "producedByRunId": "182a9eaf-881a-4d49-860c-f7e260b8bf60"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.order_status",
                    "version": "676ac323-c8e3-3cea-b172-b468827afb51",
                    "producedByRunId": "b7098939-87f0-4207-878f-dfd8e8804d8a"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.drivers",
                    "version": "93ae26cc-87d8-3eae-9cdd-f9b6fd71f1f7",
                    "producedByRunId": "9f3db1c5-5e9a-4280-8184-18aca4592c77"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.restaurants",
                    "version": "4db26821-7966-390f-9cf9-ac775fe9182b",
                    "producedByRunId": "8ddfb1d9-415f-4850-bcd6-01d02f011abe"
                }
            ]
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_orders_7_days",
                "version": "d6014479-a795-48bf-b620-a4f781436600"
            },
            "run": {
                "id": "ffba2c14-4170-48da-bec3-ab5fd4ec9a3f",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:02Z",
                "status": "COMPLETED"
            },
            "inputs": [
                {
                    "namespace": "food_delivery",
                    "name": "public.menus",
                    "version": "54a67439-3426-3272-b825-c8dabc9da6c1",
                    "producedByRunId": "4d3b8069-69b6-4708-ade0-3275112c9f04"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.menu_items",
                    "version": "88618612-23c0-3fe5-89e7-d939883a2f52",
                    "producedByRunId": "e05901b1-3a06-4b98-8d9c-aaf188c9a28c"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.orders",
                    "version": "1e3f9ed6-eea2-3ff4-83ab-4efa9dc87f1c",
                    "producedByRunId": "a43a8523-349f-4296-807f-3354ac491990"
                },
                {
                    "namespace": "food_delivery",
                    "name": "public.categories",
                    "version": "20a6093a-6edc-3c58-b3ec-43d0efc14035",
                    "producedByRunId": "6f0c13a5-f29b-46a5-90c1-0ffbebbbd1aa"
                }
            ]
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_order_status",
                "version": "ca990d26-7909-4392-a3b8-f9415a6fd304"
            },
            "run": {
                "id": "b7098939-87f0-4207-878f-dfd8e8804d8a",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:52Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_restaurants",
                "version": "4d9a0f8a-98e3-4081-8f4e-e18048881eb9"
            },
            "run": {
                "id": "8ddfb1d9-415f-4850-bcd6-01d02f011abe",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:56Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_drivers",
                "version": "7325dcfd-9250-489f-8b79-993792d0a9be"
            },
            "run": {
                "id": "9f3db1c5-5e9a-4280-8184-18aca4592c77",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:52Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_menus",
                "version": "6e761ef2-1173-4bb9-a80d-1a159982e9ca"
            },
            "run": {
                "id": "4d3b8069-69b6-4708-ade0-3275112c9f04",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:45:52Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_menu_items",
                "version": "07341e00-85fe-47b2-93c9-8c37685c2124"
            },
            "run": {
                "id": "e05901b1-3a06-4b98-8d9c-aaf188c9a28c",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:54Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_categories",
                "version": "133be69f-8648-4e00-903a-2282e895d3b7"
            },
            "run": {
                "id": "6f0c13a5-f29b-46a5-90c1-0ffbebbbd1aa",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:52Z",
                "status": "COMPLETED"
            },
            "inputs": []
        },
        {
            "job": {
                "namespace": "food_delivery",
                "name": "etl_orders",
                "version": "12dca2cf-c0ed-491f-8ad6-d4813f9c7cb6"
            },
            "run": {
                "id": "a43a8523-349f-4296-807f-3354ac491990",
                "start": "2020-02-22T22:42:42Z",
                "end": "2020-02-22T22:44:52Z",
                "status": "COMPLETED"
            },
            "inputs": []
        }
    ]
}

Checklist

You've signed-off your work
Your changes are accompanied by tests (if relevant)
Your change contains a small diff and is self-contained
You've updated any relevant documentation (if relevant)
You've included a one-line summary of your change for the CHANGELOG.md (Depending on the change, this may not be necessary).
You've versioned your .sql database schema migration according to Flyway's naming convention (if relevant)
You've included a header in any source code files (if relevant)

Signed-off-by: Julien Le Dem <[email protected]>

netlify · 2023-10-24T21:58:19Z

✅ Deploy Preview for peppy-sprite-186812 canceled.

Name	Link
🔨 Latest commit	`5f99805`
🔍 Latest deploy log	https://app.netlify.com/sites/peppy-sprite-186812/deploys/65495b2ba8011600082ad5c0

julienledem · 2023-10-24T21:58:24Z

TODO:
tests and doc

Signed-off-by: Julien Le Dem <[email protected]>

codecov · 2023-10-25T00:02:03Z

Codecov Report

Merging #2658 (5f99805) into main (42cadbb) will increase coverage by 0.16%.
Report is 1 commits behind head on main.
The diff coverage is 96.29%.

@@             Coverage Diff              @@
##               main    #2658      +/-   ##
============================================
+ Coverage     83.60%   83.76%   +0.16%     
- Complexity     1334     1338       +4     
============================================
  Files           245      247       +2     
  Lines          6076     6112      +36     
  Branches        280      281       +1     
============================================
+ Hits           5080     5120      +40     
+ Misses          844      843       -1     
+ Partials        152      149       -3

Files	Coverage Δ
api/src/main/java/marquez/db/Columns.java	`86.30% <100.00%> (+6.55%)`	⬆️
api/src/main/java/marquez/db/LineageDao.java	`100.00% <100.00%> (ø)`
.../java/marquez/db/mappers/UpstreamRunRowMapper.java	`100.00% <100.00%> (ø)`
.../src/main/java/marquez/service/LineageService.java	`83.22% <100.00%> (+2.16%)`	⬆️
...src/main/java/marquez/api/OpenLineageResource.java	`86.84% <0.00%> (-4.83%)`	⬇️

📣 Codecov offers a browser extension for seamless coverage viewing on GitHub. Try it in Chrome or Firefox today!

pawel-big-lebowski

Another PR with a moderate size and huge feature contained 👍
First-string 💯

pawel-big-lebowski · 2023-10-25T08:15:08Z

api/src/main/java/marquez/db/LineageDao.java

+          0 AS depth
+    FROM runs r
+    LEFT JOIN runs_input_mapping rim ON rim.run_uuid = r.uuid
+    LEFT JOIN dataset_versions dv ON dv.uuid = rim.dataset_version_uuid


Just thinking loudly. Wouldn't be better to join dataset_versions after the recursion at the bottom of the query once all the runs are identified?

I thought of that as well but we actually need dataset_versions in the recursion because this is where we find the run_uuid that produced the DV for the next iteration.

idea: we could add that run uuid to runs_input_mapping at the same time as the dataset_version that would allow to join just on that table in the recursion. That'd be neat.

Didn't get the idea: do you want to add dataset_versions to runs_input_mapping or run_uuid to dataset_versions?
Anyway, the current state of the query looks good to me.

The idea is we we could add the dataset_versions.run_uuid to the runs_input_mapping table.
The run_uuid is always already set in the dataset_versions table when we write the dataset_versions.uuid to the runs_input_mapping table and it never changes since it is the run that created that dataset version.
Then you just need to recursively join the runs_input_mapping over itself

but I would keep that for future improvement.

pawel-big-lebowski · 2023-10-25T08:21:40Z

api/src/main/java/marquez/db/LineageDao.java

+          dv2.run_uuid,
+          ur.depth + 1 AS depth
+    FROM upstream_runs ur
+    INNER JOIN runs r2 ON r2.uuid = ur.u_r_uuid


runs table can get huge. Is it doable to omit it in recursion step?

This is where I find the job information (r2.job_uuid, r2.job_version_uuid, r2.namespace_name, r2.job_name). Do you think I could get it a different way?
unless we add it to runs_input_mapping?

But I think I could actually move the runs join out of the recursion into the final select statement.

I just updated the query to move the joins on the runs table outside of the recursion and in the final select statement.

phixMe · 2023-10-25T15:57:07Z

api/src/main/java/marquez/api/OpenLineageResource.java

+  public Response getRunLineageUpstream(
+      @QueryParam("runId") @NotNull RunId runId,
+      @QueryParam("depth") @DefaultValue(DEFAULT_DEPTH) int depth,
+      @QueryParam("facets") String facets) {


Why are we pulling this in? I don't see it being used in the service. Did you intend to do something in the TODO block you left in there?

Yes, my idea is to be able to select what facets to return for each dataset_version job_version and run in the result.
I can remove it for now.

I removed this parameter in this iteration

Signed-off-by: Julien Le Dem <[email protected]>

pawel-big-lebowski · 2023-10-26T06:21:16Z

api/src/main/java/marquez/db/LineageDao.java

+  -- present the result: use Distinct as we may have traversed the same edge multiple times if there are diamonds in the graph.
+SELECT DISTINCT ON (upstream_runs.r_uuid, upstream_runs.dataset_version_uuid, upstream_runs.u_r_uuid)
+       upstream_runs.*,
+       -- we add the run information after the recursion so that we join with the large run table only once


pawel-big-lebowski · 2023-10-26T06:27:10Z

The overall idea looks good to me. Once the tests are created, I would be happy to approve it.

Signed-off-by: Julien Le Dem <[email protected]>

julienledem · 2023-11-01T15:25:41Z

The overall idea looks good to me. Once the tests are created, I would be happy to approve it.

@pawel-big-lebowski: done.

julienledem · 2023-11-01T15:30:54Z

I also tested this manually on actual production data for performance and it did well.

pawel-big-lebowski

This looks good to me.
Please look at the comments and let me know do you think.
I am happy to approve PR afterwards.

pawel-big-lebowski · 2023-11-02T08:12:05Z

api/src/main/java/marquez/api/OpenLineageResource.java

+  @Consumes(APPLICATION_JSON)
+  @Produces(APPLICATION_JSON)
+  @Path("/runlineage/upstream")
+  public Response getRunLineageUpstream(


Please mind documenting this in openapi.spec

api/src/main/java/marquez/db/LineageDao.java

pawel-big-lebowski · 2023-11-02T08:43:27Z

api/src/test/java/marquez/db/LineageDaoTest.java

+            "COMPLETE",
+            jobFacet,
+            Arrays.asList(),
+            Arrays.asList(dataset));


Would it make sense to add input dataset here read from some other job?
Currently recursion is not tested.

pawel-big-lebowski · 2023-11-02T08:45:45Z

api/src/main/java/marquez/db/LineageDao.java

+            r.job_uuid, r.job_version_uuid, r.namespace_name as job_namespace, r.job_name
+          FROM upstream_runs, runs r WHERE upstream_runs.r_uuid = r.uuid
+        ) sub
+      ORDER BY depth ASC;


Would it make sense to ORDER BY depth ASC, r.job_name ASC;
Tests rely on the ordering of the query output.

Signed-off-by: Julien Le Dem <[email protected]>

upstream run level lineage implementation

b4944d7

Signed-off-by: Julien Le Dem <[email protected]>

boring-cyborg bot added the api API layer changes label Oct 24, 2023

fix ordering and spurious nulls bug

2007c82

Signed-off-by: Julien Le Dem <[email protected]>

julienledem requested review from wslulciuc and pawel-big-lebowski October 24, 2023 23:53

pawel-big-lebowski reviewed Oct 25, 2023

View reviewed changes

phixMe reviewed Oct 25, 2023

View reviewed changes

julienledem added 2 commits October 25, 2023 09:14

move runs table out of the recursion

059c7da

Signed-off-by: Julien Le Dem <[email protected]>

add comments and small improvement to the query

400644a

Signed-off-by: Julien Le Dem <[email protected]>

pawel-big-lebowski reviewed Oct 26, 2023

View reviewed changes

add tests

a118c06

Signed-off-by: Julien Le Dem <[email protected]>

boring-cyborg bot added the docs label Oct 31, 2023

pawel-big-lebowski reviewed Nov 2, 2023

View reviewed changes

julienledem added 2 commits November 2, 2023 15:50

Merge branch 'main' into upstream

50a8ed5

add openapi spec

61da280

Signed-off-by: Julien Le Dem <[email protected]>

boring-cyborg bot added the spec label Nov 2, 2023

pawel-big-lebowski approved these changes Nov 3, 2023

View reviewed changes

julienledem added 2 commits November 6, 2023 10:26

review feedback: improve tests; query

870557f

Signed-off-by: Julien Le Dem <[email protected]>

Merge branch 'main' into upstream

5f99805

julienledem enabled auto-merge (squash) November 6, 2023 21:31

julienledem merged commit 752ac23 into main Nov 6, 2023
15 checks passed

julienledem deleted the upstream branch November 6, 2023 21:39

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

upstream run level lineage implementation #2658

upstream run level lineage implementation #2658

julienledem commented Oct 24, 2023 •

edited

Loading

netlify bot commented Oct 24, 2023 •

edited

Loading

julienledem commented Oct 24, 2023

codecov bot commented Oct 25, 2023 •

edited

Loading

pawel-big-lebowski left a comment

pawel-big-lebowski Oct 25, 2023

julienledem Oct 25, 2023

julienledem Oct 25, 2023

pawel-big-lebowski Oct 26, 2023

julienledem Nov 1, 2023

julienledem Nov 1, 2023

pawel-big-lebowski Oct 25, 2023 •

edited

Loading

julienledem Oct 25, 2023

julienledem Oct 25, 2023

julienledem Oct 25, 2023

phixMe Oct 25, 2023

julienledem Oct 25, 2023

julienledem Nov 1, 2023

pawel-big-lebowski Oct 26, 2023

pawel-big-lebowski commented Oct 26, 2023

julienledem commented Nov 1, 2023

julienledem commented Nov 1, 2023

pawel-big-lebowski left a comment

pawel-big-lebowski Nov 2, 2023

julienledem Nov 6, 2023

pawel-big-lebowski Nov 2, 2023

julienledem Nov 6, 2023

pawel-big-lebowski Nov 2, 2023

julienledem Nov 2, 2023

julienledem Nov 6, 2023

upstream run level lineage implementation #2658

upstream run level lineage implementation #2658

Conversation

julienledem commented Oct 24, 2023 • edited Loading

Problem

Solution

Checklist

netlify bot commented Oct 24, 2023 • edited Loading

✅ Deploy Preview for peppy-sprite-186812 canceled.

julienledem commented Oct 24, 2023

codecov bot commented Oct 25, 2023 • edited Loading

Codecov Report

pawel-big-lebowski left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

pawel-big-lebowski Oct 25, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

pawel-big-lebowski commented Oct 26, 2023

julienledem commented Nov 1, 2023

julienledem commented Nov 1, 2023

pawel-big-lebowski left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

julienledem commented Oct 24, 2023 •

edited

Loading

netlify bot commented Oct 24, 2023 •

edited

Loading

codecov bot commented Oct 25, 2023 •

edited

Loading

pawel-big-lebowski Oct 25, 2023 •

edited

Loading