feat(ingest): add ability to read other method types than GET for OAS ingest recipes #8303

Merged
merged 14 commits into from
Aug 2, 2023
18 changes: 9 additions & 9 deletions metadata-ingestion/docs/sources/openapi/openapi.md
@@ -2,11 +2,11 @@ The dataset metadata should be defined directly in the Swagger file, section `["

## Capabilities

The plugin read the swagger file where the endopints are defined and searches for the ones which accept
a `GET` call: those are the ones supposed to give back the datasets.
This plugin reads the swagger file where the endpoints are defined, reads example data if provided (for any method), or searches for
data for the endpoints which do not have example data and accept a `GET` call.

For every selected endpoint defined in the `paths` section,
the tool searches whether the medatada are already defined in there.
the tool searches whether the metadata are already defined.
As an example, if your swagger file defines `/api/users/` as follows:

```yaml
@@ -27,7 +27,7 @@ paths:

then this plugin has all the information needed to create the dataset in DataHub.

In case there is no example defined, the plugin will try to get the metadata directly from the endpoint.
In case there is no example defined, the plugin will try to get the metadata directly from the endpoint, if it is a `GET` method.
So, if in your swagger file you have

```yaml
@@ -42,7 +42,7 @@
description: Return the list of colors
```

the tool will make a `GET` call to `https:///test_endpoint.com/colors`
the tool will make a `GET` call to `https://test_endpoint.com/colors`
and parse the response obtained.
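The parsing step can be sketched as follows. This is a simplified illustration of the idea, not the plugin's actual code; the sample payload and the helper name are hypothetical:

```python
import json

def fields_from_response(body: str) -> list:
    """Derive a flat list of field names from a JSON response body.

    Mirrors the idea of inspecting the first returned record; the real
    plugin's flattening logic is more involved.
    """
    data = json.loads(body)
    record = data[0] if isinstance(data, list) else data
    return sorted(record.keys())

# A hypothetical response from https://test_endpoint.com/colors
body = '[{"color": "red", "id": 1}, {"color": "blue", "id": 2}]'
print(fields_from_response(body))  # ['color', 'id']
```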

### Automatically recorded examples
Expand All @@ -53,7 +53,7 @@ Sometimes you can have an endpoint which wants a parameter to work, like
Since in the OpenAPI specification the listing endpoints are defined
just before the detailed ones in the list of paths, you will find

https:///test_endpoint.com/colors
https://test_endpoint.com/colors

defined before

@@ -80,7 +80,7 @@ and this last URL will be called to get back the needed metadata.
If no useful example is found, a second procedure will try to guess a numerical ID.
So if we have:

https:///test_endpoint.com/colors/{colorID}
https://test_endpoint.com/colors/{colorID}

and the plugin has not already found a `colorID` example,
it will try to put the number one (1) in the parameter's place
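That last-resort substitution can be sketched with a simple regex replacement. This is an illustration only; the helper name is hypothetical:

```python
import re

def guess_numeric_url(url: str) -> str:
    """Replace every {param} placeholder with the number 1,
    as a last-resort guess when no recorded example exists."""
    return re.sub(r"\{[^{}]+\}", "1", url)

print(guess_numeric_url("https://test_endpoint.com/colors/{colorID}"))
# https://test_endpoint.com/colors/1
```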
@@ -120,8 +120,8 @@ paths:
description: Return details about the group
```

and the plugin did not found an example in its previous calls,
so the tool have no idea about what substitute to the `{name}` part.
and the plugin did not find an example in its previous calls,
the tool has no idea what to substitute for the `{name}` part.

By specifying in the configuration file

6 changes: 6 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/openapi.py
@@ -250,6 +250,12 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:  # noqa: C901
schema_metadata = set_metadata(dataset_name, endpoint_dets["data"])
dataset_snapshot.aspects.append(schema_metadata)
yield self.build_wu(dataset_snapshot, dataset_name)
elif endpoint_dets["method"] != "get":
self.report.report_warning(
key=endpoint_k,
reason=f"No example provided for {endpoint_dets['method']}",
)
continue # Only test endpoints if they're GETs
elif (
"{" not in endpoint_k
): # if the API does not explicitly require parameters
137 changes: 72 additions & 65 deletions metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
@@ -89,10 +89,13 @@ def get_swag_json(


def get_url_basepath(sw_dict: dict) -> str:
try:
if "basePath" in sw_dict:
return sw_dict["basePath"]
except KeyError: # no base path defined
return ""
if "servers" in sw_dict:
# When the API path doesn't match the OAS path
return sw_dict["servers"][0]["url"]

return ""
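For illustration, a standalone copy of the rewritten helper shows how it now covers both the Swagger 2.0 `basePath` field and the OpenAPI 3 `servers` list:

```python
def get_url_basepath(sw_dict: dict) -> str:
    if "basePath" in sw_dict:  # Swagger / OAS 2
        return sw_dict["basePath"]
    if "servers" in sw_dict:  # OAS 3: the first listed server wins
        return sw_dict["servers"][0]["url"]
    return ""

print(get_url_basepath({"basePath": "/v2"}))                            # /v2
print(get_url_basepath({"servers": [{"url": "https://api.example"}]}))  # https://api.example
print(get_url_basepath({}))                                             # (prints an empty line)
```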


def check_sw_version(sw_dict: dict) -> None:
@@ -111,74 +114,80 @@ def check_sw_version(sw_dict: dict) -> None:

def get_endpoints(sw_dict: dict) -> dict: # noqa: C901
"""
Get all the URLs accepting the "GET" method, together with their description and the tags
Get all the URLs, together with their description and the tags
"""
url_details = {}

check_sw_version(sw_dict)

for p_k, p_o in sw_dict["paths"].items():
# will track only the "get" methods, which are the ones that give us data
if "get" in p_o.keys():
if "200" in p_o["get"]["responses"].keys():
base_res = p_o["get"]["responses"]["200"]
elif 200 in p_o["get"]["responses"].keys():
# if you read a plain yml file the 200 will be an integer
base_res = p_o["get"]["responses"][200]
else:
# the endpoint does not have a 200 response
continue

if "description" in p_o["get"].keys():
desc = p_o["get"]["description"]
elif "summary" in p_o["get"].keys():
desc = p_o["get"]["summary"]
else: # still testing
desc = ""

try:
tags = p_o["get"]["tags"]
except KeyError:
tags = []

url_details[p_k] = {"description": desc, "tags": tags}

# trying if dataset is defined in swagger...
if "content" in base_res.keys():
res_cont = base_res["content"]
if "application/json" in res_cont.keys():
ex_field = None
if "example" in res_cont["application/json"]:
ex_field = "example"
elif "examples" in res_cont["application/json"]:
ex_field = "examples"

if ex_field:
if isinstance(res_cont["application/json"][ex_field], dict):
url_details[p_k]["data"] = res_cont["application/json"][
ex_field
]
elif isinstance(res_cont["application/json"][ex_field], list):
# taking the first example
url_details[p_k]["data"] = res_cont["application/json"][
ex_field
][0]
else:
logger.warning(
f"Field in swagger file does not give consistent data --- {p_k}"
)
elif "text/csv" in res_cont.keys():
url_details[p_k]["data"] = res_cont["text/csv"]["schema"]
elif "examples" in base_res.keys():
url_details[p_k]["data"] = base_res["examples"]["application/json"]

# checking whether there are defined parameters to execute the call...
if "parameters" in p_o["get"].keys():
url_details[p_k]["parameters"] = p_o["get"]["parameters"]
method = list(p_o)[0]
if "200" in p_o[method]["responses"].keys():
base_res = p_o[method]["responses"]["200"]
elif 200 in p_o[method]["responses"].keys():
# if you read a plain yml file the 200 will be an integer
base_res = p_o[method]["responses"][200]
else:
# the endpoint does not have a 200 response
continue

if "description" in p_o[method].keys():
desc = p_o[method]["description"]
elif "summary" in p_o[method].keys():
desc = p_o[method]["summary"]
else: # still testing
desc = ""

try:
tags = p_o[method]["tags"]
except KeyError:
tags = []

url_details[p_k] = {"description": desc, "tags": tags, "method": method}

example_data = check_for_api_example_data(base_res, key=p_k)
if example_data:
url_details[p_k]["data"] = example_data

# checking whether there are defined parameters to execute the call...
if "parameters" in p_o[method].keys():
url_details[p_k]["parameters"] = p_o[method]["parameters"]

return dict(sorted(url_details.items()))
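The key change above is that the first declared method is used instead of hardcoding `get`. A minimal standalone sketch of that selection logic (simplified; the sample path objects are hypothetical):

```python
def first_method_with_200(path_obj: dict):
    """Pick the first declared HTTP method and its 200 response, if any.

    Assumes the first key of the path object is an HTTP method, as the
    patch above does with `list(p_o)[0]`.
    """
    method = list(path_obj)[0]
    responses = path_obj[method].get("responses", {})
    # A plain YAML read parses the status code as the integer 200,
    # so check both the string and the int form.
    base_res = responses.get("200", responses.get(200))
    return (method, base_res) if base_res is not None else None

paths = {
    "/colors": {"get": {"responses": {200: {"description": "ok"}}}},
    "/colors/new": {"post": {"responses": {"200": {"description": "created"}}}},
}
for p, obj in paths.items():
    print(p, first_method_with_200(obj))
```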


def check_for_api_example_data(base_res: dict, key: str) -> dict:
"""
Try to determine if example data is defined for the endpoint, and return it
"""
data = {}
if "content" in base_res.keys():
res_cont = base_res["content"]
if "application/json" in res_cont.keys():
ex_field = None
if "example" in res_cont["application/json"]:
ex_field = "example"
elif "examples" in res_cont["application/json"]:
ex_field = "examples"

if ex_field:
if isinstance(res_cont["application/json"][ex_field], dict):
data = res_cont["application/json"][ex_field]
elif isinstance(res_cont["application/json"][ex_field], list):
# taking the first example
data = res_cont["application/json"][ex_field][0]
else:
logger.warning(
f"Field in swagger file does not give consistent data --- {key}"
)
elif "text/csv" in res_cont.keys():
data = res_cont["text/csv"]["schema"]
elif "examples" in base_res.keys():
data = base_res["examples"]["application/json"]

return data
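For a typical OpenAPI 3 response object, the extracted helper behaves roughly like this trimmed standalone sketch of its JSON branch (the sample payload is hypothetical):

```python
def example_from_response(base_res: dict) -> dict:
    """Trimmed sketch of check_for_api_example_data's application/json case."""
    content = base_res.get("content", {})
    json_cont = content.get("application/json", {})
    ex = json_cont.get("example", json_cont.get("examples"))
    if isinstance(ex, dict):
        return ex
    if isinstance(ex, list):  # take the first example
        return ex[0]
    return {}

res = {
    "content": {
        "application/json": {
            "examples": [{"color": "red", "id": 1}]
        }
    }
}
print(example_from_response(res))  # {'color': 'red', 'id': 1}
```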


def guessing_url_name(url: str, examples: dict) -> str:
"""
given a url and dict of extracted data, we try to guess a working URL. Example:
@@ -314,12 +323,10 @@ def extract_fields(
return ["contains_a_string"], {"contains_a_string": dict_data[0]}
else:
raise ValueError("unknown format")
if len(dict_data.keys()) > 1:
if len(dict_data) > 1:
# the elements are directly inside the dict
return flatten2list(dict_data), dict_data
dst_key = list(dict_data.keys())[
0
] # the first and unique key is the dataset's name
dst_key = list(dict_data)[0] # the first and unique key is the dataset's name

try:
return flatten2list(dict_data[dst_key]), dict_data[dst_key]