pre-commit · asottile · Mar 30, 2019 · Mar 30, 2019
diff --git a/pre_commit_hooks/check_docstring_first.py b/pre_commit_hooks/check_docstring_first.py
@@ -8,14 +8,23 @@
 from typing import Optional
 from typing import Sequence
 
+import six
 
-NON_CODE_TOKENS = frozenset((
-    tokenize.COMMENT, tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL,
-))
+if six.PY2:  # pragma: no cover (PY2)
+    from tokenize import generate_tokens as tokenize_tokenize
+    OTHER_NON_CODE = ()  # nothing extra to ignore on PY2
+else:  # pragma: no cover (PY3)
+    from tokenize import tokenize as tokenize_tokenize
+    OTHER_NON_CODE = (tokenize.ENCODING,)  # treated as non-code
+
+NON_CODE_TOKENS = frozenset(
+    (tokenize.COMMENT, tokenize.ENDMARKER, tokenize.NEWLINE, tokenize.NL) +
+    OTHER_NON_CODE,
+)
 
 
 def check_docstring_first(src, filename='<unknown>'):
-    # type: (str, str) -> int
+    # type: (bytes, str) -> int
     """Returns nonzero if the source has what looks like a docstring that is
     not at the beginning of the source.
 
@@ -25,7 +34,7 @@ def check_docstring_first(src, filename='<unknown>'):
     found_docstring_line = None
     found_code_line = None
 
-    tok_gen = tokenize.generate_tokens(io.StringIO(src).readline)
+    tok_gen = tokenize_tokenize(io.BytesIO(src).readline)
     for tok_type, _, (sline, scol), _, _ in tok_gen:
         # Looks like a docstring!
         if tok_type == tokenize.STRING and scol == 0:
@@ -61,7 +70,7 @@ def main(argv=None):  # type: (Optional[Sequence[str]]) -> int
     retv = 0
 
     for filename in args.filenames:
-        with io.open(filename, encoding='UTF-8') as f:
+        with open(filename, 'rb') as f:
             contents = f.read()
         retv |= check_docstring_first(contents, filename=filename)
 

diff --git a/tests/check_docstring_first_test.py b/tests/check_docstring_first_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
@@ -10,37 +11,37 @@
 # Contents, expected, expected_output
 TESTS = (
     # trivial
-    ('', 0, ''),
+    (b'', 0, ''),
     # Acceptable
-    ('"foo"', 0, ''),
+    (b'"foo"', 0, ''),
     # Docstring after code
     (
-        'from __future__ import unicode_literals\n'
-        '"foo"\n',
+        b'from __future__ import unicode_literals\n'
+        b'"foo"\n',
         1,
         '{filename}:2 Module docstring appears after code '
         '(code seen on line 1).\n',
     ),
     # Test double docstring
     (
-        '"The real docstring"\n'
-        'from __future__ import absolute_import\n'
-        '"fake docstring"\n',
+        b'"The real docstring"\n'
+        b'from __future__ import absolute_import\n'
+        b'"fake docstring"\n',
         1,
         '{filename}:3 Multiple module docstrings '
         '(first docstring on line 1).\n',
     ),
     # Test multiple lines of code above
     (
-        'import os\n'
-        'import sys\n'
-        '"docstring"\n',
+        b'import os\n'
+        b'import sys\n'
+        b'"docstring"\n',
         1,
         '{filename}:3 Module docstring appears after code '
         '(code seen on line 1).\n',
     ),
     # String literals in expressions are ok.
-    ('x = "foo"\n', 0, ''),
+    (b'x = "foo"\n', 0, ''),
 )
 
 
@@ -58,6 +59,13 @@ def test_unit(capsys, contents, expected, expected_out):
 @all_tests
 def test_integration(tmpdir, capsys, contents, expected, expected_out):
     f = tmpdir.join('test.py')
-    f.write(contents)
+    f.write_binary(contents)  # contents is expected to be bytes
     assert main([f.strpath]) == expected
     assert capsys.readouterr()[0] == expected_out.format(filename=f.strpath)
+
+
+def test_arbitrary_encoding(tmpdir):
+    f = tmpdir.join('f.py')
+    contents = '# -*- coding: cp1252\nx = "£"'.encode('cp1252')
+    f.write_binary(contents)
+    assert main([f.strpath]) == 0