Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Sort __globals__ dict to make pickle string more deterministic #418

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cloudpickle/cloudpickle_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,8 @@ def _function_getstate(func):
}

f_globals_ref = _extract_code_globals(func.__code__)
f_globals = {k: func.__globals__[k] for k in f_globals_ref if k in
func.__globals__}
f_globals = {k: func.__globals__[k] for k in sorted(f_globals_ref)
if k in func.__globals__}

closure_values = (
list(map(_get_cell_contents, func.__closure__))
Expand Down
31 changes: 31 additions & 0 deletions tests/cloudpickle_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import enum
import typing
from functools import wraps
import time

import pytest

Expand All @@ -49,6 +50,7 @@
from cloudpickle.cloudpickle import _extract_class_dict, _whichmodule
from cloudpickle.cloudpickle import _lookup_module_and_qualname

from .testutils import subprocess_pickle_string
from .testutils import subprocess_pickle_echo
from .testutils import assert_run_python_script
from .testutils import subprocess_worker
Expand All @@ -57,6 +59,9 @@


# Module-level globals referenced from functions that the tests pickle
# (for example the helper defined inside test_sorted_globals).
_TEST_GLOBAL_VARIABLE = "default_value"
_TEST_GLOBAL_VARIABLE2 = "another_value"

# Define, via exec, a module-level function whose body returns the 1000 names
# a0 .. a999, so that its code object references a large number of global
# names. test_efficient_sorted_globals inspects and pickles this function.
exec("def _TEST_BIG_GLOBAL_SPACE():\n return %s" % ", ".join([f"a{i}" for i in range(1000)]))


class RaiserOnPickle(object):
Expand Down Expand Up @@ -2321,6 +2326,32 @@ def __type__(self):
o = MyClass()
pickle_depickle(o, protocol=self.protocol)

@pytest.mark.skipif(
sys.version_info < (3, 6, 0),
reason="Dict determinism is a lost cause in Python < 3.6")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With 3.5 EOL, it does not seem worth putting any effort into this. Downstream tools that care about determinism have likely been 3.6+ anyway.

def test_sorted_globals(self):
    """Check that the pickle string does not depend on the hash seed.

    A function reading two module-level globals is pickled by child
    Python processes started with different ``PYTHONHASHSEED`` values;
    every run must produce the same pickle string.
    """
    def func_with_globals():
        return _TEST_GLOBAL_VARIABLE + _TEST_GLOBAL_VARIABLE2

    # Collect the distinct pickle strings produced across hash seeds.
    distinct_pickles = {
        subprocess_pickle_string(
            func_with_globals,
            protocol=self.protocol,
            add_env={"PYTHONHASHSEED": str(seed)},
        )
        for seed in range(5)
    }
    assert len(distinct_pickles) == 1

def test_efficient_sorted_globals(self):
    """Non-regression test: many globals must not slow down pickling.

    First checks that ``_extract_code_globals`` reports all 1000 names
    referenced by ``_TEST_BIG_GLOBAL_SPACE``, then times a round trip
    through ``subprocess_pickle_string``.
    """
    expected_globals = {f"a{i}" for i in range(1000)}
    assert cloudpickle.cloudpickle._extract_code_globals(
        _TEST_BIG_GLOBAL_SPACE.__code__) == expected_globals
    # Use a monotonic, high-resolution clock: time.time() can jump when the
    # system clock is adjusted, which would corrupt the measured interval.
    # NOTE(review): a wall-clock threshold can still fail at random on an
    # overloaded CI worker.
    tic = time.perf_counter()
    subprocess_pickle_string(_TEST_BIG_GLOBAL_SPACE, protocol=self.protocol)
    assert time.perf_counter() - tic < 0.5
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would rather not put tests that depend on the runtime because the CI workers can be overloaded from time to time and those tests would fail at random.

Instead just run a one-off benchmark and post the results in the comments of this PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to address this if we choose to go for #428 (which I think we should ;).



class Protocol2CloudPickleTest(CloudPickleTest):

Expand Down
36 changes: 28 additions & 8 deletions tests/testutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,24 +38,25 @@ def _make_cwd_env():
return cloudpickle_repo_folder, env


def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT):
"""Echo function with a child Python process
def subprocess_pickle_string(input_data, protocol=None, timeout=TIMEOUT,
add_env=None):
"""Retrieve pickle string of an object generated by a child Python process

Pickle the input data into a buffer, send it to a subprocess via
stdin, expect the subprocess to unpickle, re-pickle that data back
and send it back to the parent process via stdout for final unpickling.
and send it back to the parent process via stdout.

>>> subprocess_pickle_echo([1, 'a', None])
[1, 'a', None]
>>> testutils.subprocess_pickle_string([1, 'a', None], protocol=2)
b'\x80\x02]q\x00(K\x01X\x01\x00\x00\x00aq\x01Ne.'

"""
# run then pickle_echo(protocol=protocol) in __main__:

# Protect stderr from any warning, as we will assume an error will happen
# if it is not empty. A concrete example is pytest using the imp module,
# which is deprecated in python 3.8
cmd = [sys.executable, '-W ignore', __file__, "--protocol", str(protocol)]
cwd, env = _make_cwd_env()
if add_env:
env.update(add_env)
proc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=cwd, env=env,
bufsize=4096)
pickle_string = dumps(input_data, protocol=protocol)
Expand All @@ -67,14 +68,33 @@ def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT):
message = "Subprocess returned %d: " % proc.returncode
message += err.decode('utf-8')
raise RuntimeError(message)
return loads(out)
return out
except TimeoutExpired as e:
proc.kill()
out, err = proc.communicate()
message = u"\n".join([out.decode('utf-8'), err.decode('utf-8')])
raise RuntimeError(message) from e


def subprocess_pickle_echo(input_data, protocol=None, timeout=TIMEOUT,
                           add_env=None):
    """Round-trip an object through a child Python process.

    The pickle string is obtained from ``subprocess_pickle_string``, which
    sends the pickled input to a subprocess and returns what the
    subprocess writes back; that string is then unpickled here in the
    parent process and the resulting object is returned.

    >>> subprocess_pickle_echo([1, 'a', None])
    [1, 'a', None]

    """
    forwarded = dict(protocol=protocol, timeout=timeout, add_env=add_env)
    return loads(subprocess_pickle_string(input_data, **forwarded))


def _read_all_bytes(stream_in, chunk_size=4096):
all_data = b""
while True:
Expand Down