-
Notifications
You must be signed in to change notification settings - Fork 159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add cleanup functions to fix recursive loop and no such file raised by broken symlinks #1115
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It sounds like an operator or decorator. I'm ok with keeping it in the project's root path, but what are your thoughts on placing it in the operator directory instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The challenge is that while |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
""" | ||
Use this script to identify broken symbolic links or recursive loops in a local directory: | ||
$ python -m cosmos.cleanup -p <dir-path> | ||
To delete the issues identified, run: | ||
$ python -m cosmos.cleanup -p <dir-path> -d | ||
""" | ||
|
||
import argparse | ||
import logging | ||
import os | ||
from pathlib import Path | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def identify_broken_symbolic_links(dir_path: str, should_delete: bool = False) -> None:
    """
    Given a directory, recursively inspect it in search for broken symbolic links.

    A broken symbolic link is a symbolic link whose target cannot be resolved
    (for example, the target file no longer exists). Regular files are never
    reported nor deleted, even if they cannot be inspected.

    If should_delete is set to True, delete the broken symbolic links identified.

    :param dir_path: Path to the directory to be analysed
    :param should_delete: Users should set to True if they want the method to not only identify but also delete these links.
    """
    logger.info(f"Inspecting the directory {dir_path} for broken symbolic links.")
    broken_symlinks_count = 0
    deleted_symlinks_count = 0

    # os.walk does not follow symbolic links by default. A link whose target
    # does not resolve to a directory (which includes every broken link) is
    # listed under `files`.
    for root_dir, _dirs, files in os.walk(dir_path):
        for filename in files:
            filepath = os.path.join(root_dir, filename)

            # As suggested during code review: explicitly check that the entry
            # is a symbolic link before treating it as broken. os.path.exists
            # follows the link and returns False when the target is missing.
            if not os.path.islink(filepath) or os.path.exists(filepath):
                continue

            broken_symlinks_count += 1
            logger.warning(f"The folder {dir_path} contains a symbolic link to a non-existent file: {filepath}")
            if should_delete:
                logger.info(f"Deleting the invalid symbolic link: {filepath}")
                os.unlink(filepath)
                deleted_symlinks_count += 1

    logger.info(
        f"After inspecting {dir_path}, identified {broken_symlinks_count} broken links and deleted {deleted_symlinks_count} of them."
    )
|
||
|
||
# Airflow DAG parsing fails if recursive loops are found, so this method cannot be used from within an Airflow task | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we also add an example here on how to call this and where to call this like mentioned in the PR description. Or should we also create a public docs page listing the steps that we can share with users? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think a short docs will be great and also we could render example in the docs |
||
def identify_recursive_loops(original_dir_path: str, should_delete: bool = False) -> None:
    """
    Given a directory, recursively inspect it in search for recursive loops.

    A recursive loop is a symbolic link to a directory whose resolved target is
    one of the (resolved) ancestors of the link itself, so that following the
    link leads back to a directory that contains it.

    If should_delete is set to True, delete the (symbolic links) recursive loops identified.

    :param original_dir_path: Path to the directory to be analysed
    :param should_delete: Users should set to True if they want the method to not only identify but also delete these loops.
    """
    logger.info(f"Inspecting the directory {original_dir_path} for recursive loops.")
    recursive_loops_count = 0
    deleted_symlinks_count = 0

    dir_path = Path(original_dir_path).absolute()

    # Collect every sub-directory entry up-front, so links are not deleted
    # while the tree is still being walked. os.walk lists symbolic links to
    # directories under `dirs` but does not descend into them by default.
    dirs_paths = [
        os.path.join(root_dir, dir_name) for root_dir, dirs, _files in os.walk(dir_path) for dir_name in dirs
    ]

    for subdir_path in dirs_paths:
        if not os.path.islink(subdir_path):
            continue

        symlink_target_path = os.path.realpath(subdir_path)
        # The target is fully resolved, so the ancestors it is compared with
        # must be resolved too. Otherwise a loop goes unnoticed whenever the
        # inspected directory is itself reached through a symbolic link.
        resolved_parent = Path(os.path.realpath(os.path.dirname(subdir_path)))
        target = Path(symlink_target_path)
        if target != resolved_parent and target not in resolved_parent.parents:
            continue

        logger.warning(f"Detected recursive loop from {subdir_path} to {symlink_target_path}")
        recursive_loops_count += 1
        if should_delete:
            logger.info(f"Deleting symbolic link: {subdir_path}")
            os.unlink(subdir_path)
            deleted_symlinks_count += 1

    logger.info(
        f"After inspecting {dir_path}, identified {recursive_loops_count} recursive loops and deleted {deleted_symlinks_count} of them."
    )
|
||
|
||
if __name__ == "__main__":
    # Without a configured handler, the INFO-level progress and summary
    # messages emitted by the functions below would be dropped when this module
    # is run as a script. basicConfig does nothing if the root logger already
    # has handlers configured.
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="Clean up local directory from broken symbolic links and recursive loops."
    )
    parser.add_argument("-p", "--dir-path", help="Path to directory to be inspected", required=True)
    # A "store_true" flag is optional and defaults to False on its own.
    parser.add_argument("-d", "--delete", help="Delete problems found", action="store_true")
    args = parser.parse_args()

    identify_recursive_loops(args.dir_path, args.delete)
    identify_broken_symbolic_links(args.dir_path, args.delete)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
""" | ||
We've observed users who had dbt project directories containing symbolic links to files that no longer existed. | ||
|
||
Although this issue was not created by Cosmos itself, since this issue was already observed by two users, we thought it | ||
was useful to give an example DAG illustrating how to clean the problematic directories. | ||
|
||
Assuming the cause of the issue no longer exists, this DAG only needs to be run once. | ||
""" | ||
|
||
# [START dirty_dir_example] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be great if we could use tag |
||
from datetime import datetime | ||
from pathlib import Path | ||
|
||
from airflow.decorators import dag, task | ||
|
||
from cosmos.cleanup import identify_broken_symbolic_links | ||
|
||
dbt_project_folder = Path(__file__).parent / "dbt" | ||
|
||
|
||
@dag(
    tags=["example"],
    catchup=False,
    start_date=datetime(2023, 1, 1),
    schedule_interval="0 0 * * 0",  # Runs every Sunday
)
def example_cosmos_cleanup_dir_dag():
    # Example DAG with a single task that removes the broken symbolic links
    # found under the dbt project folder defined at module level.

    @task()
    def clear_broken_symlinks(session=None):
        # `session` is accepted but not used by this task.
        identify_broken_symbolic_links(dbt_project_folder, should_delete=True)

    clear_broken_symlinks()
|
||
|
||
# [END dirty_dir_example] | ||
|
||
example_cosmos_cleanup_dir_dag() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we add some tests for this module or exclude from codecov for the time being?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1