Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raise exception when open with write mode in call stack #140

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,21 @@ Most methods and properties from `pathlib.Path` are supported except for the one
| `key` | ❌ | ✅ | ❌ |
| `md5` | ✅ | ❌ | ❌ |


## Writing to cloud files

**Warning:** You can't call `open(CloudPath("s3://path"), "w")` and have write to the cloud file (reading works fine with the built-in open). Instead of using the Python built-in open, you must use `CloudPath("s3://path").open("w")`. For more iformation, see [#128](https://github.com/drivendataorg/cloudpathlib/issues/128) and [#140](https://github.com/drivendataorg/cloudpathlib/pull/140).

We try to detect this scenario and raise a `BuiltInOpenWriteError` exception for you. There is a slight performance hit for this check, and if _you are sure_ that either (1) you are not writing to cloud files or (2) you are writing, but you are using the `CloudPath.open` method every time, you can skip this check by setting the environment variable `CLOUDPATHLIB_CHECK_UNSAFE_OPEN=False`.

If you are passing the `CloudPath` into another library and you see `BuiltInOpenWriteError`, try opening and passing the buffer into that function instead:

```python
with CloudPath("s3://bucket/path_to_write.txt").open("w") as fp:
function_that_writes(fp)
```


----

<sup>Icon made by <a href="https://www.flaticon.com/authors/srip" title="srip">srip</a> from <a href="https://www.flaticon.com/" title="Flaticon">www.flaticon.com</a>.</sup>
Expand Down
77 changes: 77 additions & 0 deletions cloudpathlib/cloudpath.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
import abc
import ast
from collections import defaultdict
import collections.abc
import fnmatch
import inspect
import os
from pathlib import Path, PosixPath, PurePosixPath, WindowsPath
from textwrap import dedent
from typing import Any, IO, Iterable, Optional, TYPE_CHECKING, Union
from urllib.parse import urlparse
from warnings import warn

from .exceptions import (
BuiltInOpenWriteError,
ClientMismatchError,
CloudPathFileExistsError,
CloudPathIsADirectoryError,
Expand All @@ -27,6 +31,8 @@
if TYPE_CHECKING:
from .client import Client

CHECK_UNSAFE_OPEN = not (os.getenv("CLOUDPATHLIB_CHECK_UNSAFE_OPEN", "True").lower() == "false")
pjbull marked this conversation as resolved.
Show resolved Hide resolved


class CloudImplementation:
def __init__(self):
Expand Down Expand Up @@ -205,6 +211,38 @@ def __eq__(self, other: Any) -> bool:
return isinstance(other, type(self)) and str(self) == str(other)

def __fspath__(self):
# make sure that we're not getting called by the builtin open
# in a write mode, since we won't actually write to the cloud in
# that scenario
if CHECK_UNSAFE_OPEN:
frame = inspect.currentframe().f_back

caller_src = inspect.getsource(frame)

# also get local variables in the frame
caller_local_variables = frame.f_locals

# get all the instances in the previous frame of our class
instances_of_type = [
varname
for varname, instance in caller_local_variables.items()
if isinstance(instance, type(self))
]

# Walk the AST of the previous frame source and see if
# open is called with a variable of our type...
if any(
_is_open_call_write_with_var(n, var_names=instances_of_type, var_type=type(self))
for n in ast.walk(ast.parse(dedent(caller_src)))
):
raise BuiltInOpenWriteError(
"Detected the use of built-in open function with a cloud path and write mode. "
"Cloud paths do not support the open function in write mode; "
"please use the .open() method instead.\n\n"
"NOTE: Exact line of this error may be incorrect, but open-for-write call is "
"within the same scope. (Skip this check with env var CLOUDPATHLIB_CHECK_UNSAFE_OPEN=False)."
)

if self.is_file():
self._refresh_cache(force_overwrite_from_cloud=False)
return str(self._local)
Expand Down Expand Up @@ -749,3 +787,42 @@ def _resolve(path: PurePosixPath) -> str:
newpath = newpath + sep + name

return newpath or sep


# This function is used to check if our `__fspath__` implementation has been
# called in a writeable mode from the built-in open function.
def _is_open_call_write_with_var(ast_node, var_names=None, var_type=None):
"""For a given AST node, check that the node is a `Call`, and that the
call is to a function with the name `open`, and that the last argument

If passed, return True if the first argument is a variable with a name in var_names.

If passed, return True if the first arg is a Call to instantiate var_type.
"""
if not isinstance(ast_node, ast.Call):
return False
if not hasattr(ast_node, "func"):
return False
if not hasattr(ast_node.func, "id"):
return False
if ast_node.func.id != "open":
return False

# we are in an open call, get the path as first arg
path = ast_node.args[0]

# get the mode as second arg or kwarg where arg==mode
mode = (
ast_node.args[1]
if len(ast_node.args) >= 2
else [kwarg for kwarg in ast_node.keywords if kwarg.arg == "mode"][0].value
)

# Ensure the path is either a call to instantiate var_type or
# the name of a variable we know is of the right type
path_is_of_type = (isinstance(path, ast.Call) and path.func.id == var_type.__name__) or (
hasattr(path, "id") and (path.id in var_names)
)

WRITE_MODES = {"r+", "w", "w+", "a", "a+", "rb+", "wb", "wb+", "ab", "ab+"}
pjbull marked this conversation as resolved.
Show resolved Hide resolved
return (mode.s in WRITE_MODES) and path_is_of_type
4 changes: 4 additions & 0 deletions cloudpathlib/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ class AnyPathTypeError(CloudPathException, TypeError):
pass


class BuiltInOpenWriteError(CloudPathException):
pass


class ClientMismatchError(CloudPathException, ValueError):
pass

Expand Down
52 changes: 50 additions & 2 deletions tests/test_cloudpath_file_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,11 @@

import pytest

from cloudpathlib.exceptions import CloudPathIsADirectoryError, DirectoryNotEmptyError
from cloudpathlib.exceptions import (
BuiltInOpenWriteError,
CloudPathIsADirectoryError,
DirectoryNotEmptyError,
)


def test_file_discovery(rig):
Expand Down Expand Up @@ -109,7 +113,51 @@ def test_fspath(rig):
assert os.fspath(p) == p.fspath


def test_os_open(rig):
def test_os_open_read(rig):
p = rig.create_cloud_path("dir_0/file0_0.txt")
with open(p, "r") as f:
assert f.readable()


# entire function is passed as source, so check separately
# that all of the built in open write modes fail
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this is interesting. I thought it was a feature of my previous version of the test that the function would have both safe and unsafe opens, and only fail for the unsafe one. Is it the case that if we had a safe open first and unsafeopen second in the same function, it would fail on the first one because the checking code would see the second open?

Anyways, I think some additional things we want to test:

  • open(pathlib_path, "w") and open(cloud_path, "w")` in the same function
  • monkeypatching that constant to False disables the check

def test_os_open_write0(rig):
p = rig.create_cloud_path("dir_0/file0_0.txt")

with pytest.raises(BuiltInOpenWriteError):
with open(p, "w") as f:
assert f.readable()


def test_os_open_write1(rig):
p = rig.create_cloud_path("dir_0/file0_0.txt")

with pytest.raises(BuiltInOpenWriteError):
with open(p, "wb") as f:
assert f.readable()


def test_os_open_write2(rig):
p = rig.create_cloud_path("dir_0/file0_0.txt")

with pytest.raises(BuiltInOpenWriteError):
with open(p, "a") as f:
assert f.readable()


def test_os_open_write3(rig):
p = rig.create_cloud_path("dir_0/file0_0.txt")

with pytest.raises(BuiltInOpenWriteError):
with open(p, "r+") as f:
assert f.readable()


def test_os_open_write4(rig):
p = rig.create_cloud_path("dir_0/file0_0.txt")
with pytest.raises(BuiltInOpenWriteError):
with open(
p,
"w",
) as f:
assert f.readable()