diff --git a/.bandit.yml b/.bandit.yml index 69acb67..841bd6f 100644 --- a/.bandit.yml +++ b/.bandit.yml @@ -1,2 +1,5 @@ skips: - B107 +- B101 +- B311 +exclude_dirs: ['tests'] \ No newline at end of file diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 0000000..e86e521 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,3 @@ +# This is a list of commits that git blame should ignore +# running pre-commit on all the code +47be6dd35faf78f06b6183ea9dad6b5f78f5c756 \ No newline at end of file diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a7d8f92..547559e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,16 +20,10 @@ jobs: TOXENV: docs - python-version: "3.12" env: - TOXENV: flake8 + TOXENV: pre-commit - python-version: "3.12" env: TOXENV: pylint - - python-version: "3.12" - env: - TOXENV: security - - python-version: "3.12" - env: - TOXENV: black - python-version: "3.12" env: TOXENV: typing diff --git a/.isort.cfg b/.isort.cfg new file mode 100644 index 0000000..6860bdb --- /dev/null +++ b/.isort.cfg @@ -0,0 +1,2 @@ +[settings] +profile = black \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..bf7ebb0 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: +- repo: https://github.com/PyCQA/bandit + rev: 1.7.7 + hooks: + - id: bandit + args: [-r, -c, .bandit.yml] +- repo: https://github.com/PyCQA/flake8 + rev: 7.0.0 + hooks: + - id: flake8 +- repo: https://github.com/psf/black.git + rev: 24.1.1 + hooks: + - id: black +- repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort diff --git a/docs/conf.py b/docs/conf.py index cb57d42..0d6ec28 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,87 +9,88 @@ # All configuration values have a default; values that are commented out # serve to show the default. 
-import sys, os +import os +import sys # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('..')) +sys.path.insert(0, os.path.abspath("..")) # -- General configuration ----------------------------------------------------- # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +# needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ - 'hoverxref.extension', - 'notfound.extension', - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', + "hoverxref.extension", + "notfound.extension", + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.viewcode", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix of source filenames. -source_suffix = '.rst' +source_suffix = ".rst" # The encoding of source files. -#source_encoding = 'utf-8-sig' +# source_encoding = 'utf-8-sig' # The master toctree document. -master_doc = 'index' +master_doc = "index" # General information about the project. -project = 'w3lib' -copyright = '2014, w3lib developers' +project = "w3lib" +copyright = "2014, w3lib developers" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The full version, including alpha/beta/rc tags. -release = '2.1.2' +release = "2.1.2" # The short X.Y version. 
-version = '.'.join(release.split('.')[:2]) +version = ".".join(release.split(".")[:2]) # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. -#language = None +# language = None # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: -#today = '' +# today = '' # Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' +# today_fmt = '%B %d, %Y' # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ["_build"] # The reST default role (used for this markup: `text`) to use for all documents. -#default_role = None +# default_role = None # If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True +# add_function_parentheses = True # If true, the current module name will be prepended to all description # unit titles (such as .. function::). -#add_module_names = True +# add_module_names = True # If true, sectionauthor and moduleauthor directives will be shown in the # output. They are ignored by default. -#show_authors = False +# show_authors = False # The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' +pygments_style = "sphinx" # A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] +# modindex_common_prefix = [] # -- Options for HTML output --------------------------------------------------- @@ -101,26 +102,26 @@ # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -#html_theme_options = {} +# html_theme_options = {} # Add any paths that contain custom themes here, relative to this directory. -#html_theme_path = [] +# html_theme_path = [] # The name for this set of Sphinx documents. 
If None, it defaults to # "<project> v<release> documentation". -#html_title = None +# html_title = None # A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None +# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. -#html_logo = None +# html_logo = None # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -#html_favicon = None +# html_favicon = None # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -129,101 +130,95 @@ # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' +# html_last_updated_fmt = '%b %d, %Y' # If true, SmartyPants will be used to convert quotes and dashes to # typographically correct entities. -#html_use_smartypants = True +# html_use_smartypants = True # Custom sidebar templates, maps document names to template names. -#html_sidebars = {} +# html_sidebars = {} # Additional templates that should be rendered to pages, maps page names to # template names. -#html_additional_pages = {} +# html_additional_pages = {} # If false, no module index is generated. -#html_domain_indices = True +# html_domain_indices = True # If false, no index is generated. -#html_use_index = True +# html_use_index = True # If true, the index is split into individual pages for each letter. -#html_split_index = False +# html_split_index = False # If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True +# html_show_sourcelink = True # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True +# html_show_sphinx = True # If true, "(C) Copyright ..." is shown in the HTML footer. 
Default is True. -#html_show_copyright = True +# html_show_copyright = True # If true, an OpenSearch description file will be output, and all pages will # contain a <link> tag referring to it. The value of this option must be the # base URL from which the finished HTML is served. -#html_use_opensearch = '' +# html_use_opensearch = '' # This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None +# html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'w3libdoc' +htmlhelp_basename = "w3libdoc" # -- Options for LaTeX output -------------------------------------------------- latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', + # The paper size ('letterpaper' or 'a4paper'). + # 'papersize': 'letterpaper', + # The font size ('10pt', '11pt' or '12pt'). + # 'pointsize': '10pt', + # Additional stuff for the LaTeX preamble. + # 'preamble': '', } # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'w3lib.tex', 'w3lib Documentation', - 'w3lib developers', 'manual'), + ("index", "w3lib.tex", "w3lib Documentation", "w3lib developers", "manual"), ] # The name of an image file (relative to this directory) to place at the top of # the title page. -#latex_logo = None +# latex_logo = None # For "manual" documents, if this is true, then toplevel headings are parts, # not chapters. -#latex_use_parts = False +# latex_use_parts = False # If true, show page references after internal links. -#latex_show_pagerefs = False +# latex_show_pagerefs = False # If true, show URL addresses after external links. -#latex_show_urls = False +# latex_show_urls = False # Documents to append as an appendix to all manuals. 
-#latex_appendices = [] +# latex_appendices = [] # If false, no module index is generated. -#latex_domain_indices = True +# latex_domain_indices = True # -- Options for manual page output -------------------------------------------- # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'w3lib', 'w3lib Documentation', - ['w3lib developers'], 1) -] +man_pages = [("index", "w3lib", "w3lib Documentation", ["w3lib developers"], 1)] # If true, show URL addresses after external links. -#man_show_urls = False +# man_show_urls = False # -- Options for Texinfo output ------------------------------------------------ @@ -232,27 +227,33 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'w3lib', 'w3lib Documentation', - 'w3lib developers', 'w3lib', 'One line description of project.', - 'Miscellaneous'), + ( + "index", + "w3lib", + "w3lib Documentation", + "w3lib developers", + "w3lib", + "One line description of project.", + "Miscellaneous", + ), ] # Documents to append as an appendix to all manuals. -#texinfo_appendices = [] +# texinfo_appendices = [] # If false, no module index is generated. -#texinfo_domain_indices = True +# texinfo_domain_indices = True # How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' +# texinfo_show_urls = 'footnote' # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { - 'pytest': ('https://docs.pytest.org/en/latest', None), - 'python': ('https://docs.python.org/3', None), - 'scrapy': ('https://scrapy.readthedocs.io/en/latest', None), - 'tox': ('https://tox.readthedocs.io/en/latest', None), + "pytest": ("https://docs.pytest.org/en/latest", None), + "python": ("https://docs.python.org/3", None), + "scrapy": ("https://scrapy.readthedocs.io/en/latest", None), + "tox": ("https://tox.readthedocs.io/en/latest", None), } diff --git a/setup.py b/setup.py index c8728de..9825882 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,4 @@ -from setuptools import setup, find_packages - +from setuptools import find_packages, setup setup( name="w3lib", diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 0389e78..3a2f5c4 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -1,11 +1,11 @@ import codecs import unittest -from typing import Optional, Union, List, Any +from typing import Any, List, Optional, Union from w3lib.encoding import ( html_body_declared_encoding, - http_content_type_encoding, html_to_unicode, + http_content_type_encoding, read_bom, resolve_encoding, to_unicode, diff --git a/tests/test_http.py b/tests/test_http.py index 76a1ff1..2125e9e 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -1,5 +1,6 @@ import unittest from collections import OrderedDict + from w3lib.http import ( HeadersDictInput, basic_auth_header, diff --git a/tests/test_url.py b/tests/test_url.py index 2960d5e..ca84745 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1,8 +1,8 @@ -import sys import os +import sys import unittest from inspect import isclass -from typing import Optional, Union, Type, Callable, Tuple, List +from typing import Callable, List, Optional, Tuple, Type, Union from urllib.parse import urlparse import pytest @@ -27,8 +27,8 @@ path_to_file_uri, safe_download_url, safe_url_string, - url_query_parameter, url_query_cleaner, + url_query_parameter, ) # Test cases for 
URL-to-safe-URL conversions with a URL and an encoding as @@ -364,9 +364,11 @@ def _test_safe_url_string( @pytest.mark.parametrize( "encoding,url,output", tuple( - case - if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES - else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + ( + case + if case[:2] not in KNOWN_SAFE_URL_STRING_ENCODING_ISSUES + else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + ) for case in SAFE_URL_ENCODING_CASES ), ) @@ -425,9 +427,11 @@ def test_safe_url_string_encoding( @pytest.mark.parametrize( "url,output", tuple( - case - if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES - else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + ( + case + if case[0] not in KNOWN_SAFE_URL_STRING_URL_ISSUES + else pytest.param(*case, marks=pytest.mark.xfail(strict=True)) + ) for case in SAFE_URL_URL_CASES ), ) @@ -713,7 +717,8 @@ def test_safe_url_string_preserve_nonfragment_hash(self): def test_safe_url_string_encode_idna_domain_with_port(self): self.assertEqual( - safe_url_string("http://新华网.中国:80"), "http://xn--xkrr14bows.xn--fiqs8s:80" + safe_url_string("http://新华网.中国:80"), + "http://xn--xkrr14bows.xn--fiqs8s:80", ) def test_safe_url_string_encode_idna_domain_with_username_password_and_port_number( diff --git a/tox.ini b/tox.ini index 5682201..e97b62f 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. 
[tox] -envlist = py38, py39, py310, py311, py312, pypy3, docs, security, flake8, pylint, black, typing +envlist = py38, py39, py310, py311, py312, pypy3, docs, pylint, typing, pre-commit [testenv] deps = @@ -16,12 +16,6 @@ commands = --cov=w3lib --cov-report=term --cov-report=xml \ {posargs:w3lib tests} -[testenv:security] -deps = - bandit -commands = - bandit -r -c .bandit.yml {posargs:w3lib} - [testenv:typing] basepython = python3 deps = @@ -31,14 +25,6 @@ deps = commands = mypy --strict {posargs: w3lib tests} -[testenv:flake8] -basepython = python3 -deps = - flake8==6.1.0 -commands = - flake8 \ - {posargs:w3lib tests setup.py} - [testenv:pylint] deps = {[testenv]deps} @@ -46,14 +32,13 @@ deps = commands = pylint conftest.py docs setup.py tests w3lib -[testenv:black] -deps = - black==22.6.0 -commands = - black {posargs:--check conftest.py setup.py tests w3lib} - [testenv:docs] changedir = docs deps = -rdocs/requirements.txt commands = sphinx-build -W -b html . {envtmpdir}/html + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure +skip_install = true diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 7d46d78..8877c6f 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -1,13 +1,14 @@ """ Functions for handling encoding of web pages """ -import re + import codecs import encodings +import re from typing import Callable, Match, Optional, Tuple, Union, cast -from w3lib._types import AnyUnicodeError, StrOrBytes import w3lib.util +from w3lib._types import AnyUnicodeError, StrOrBytes _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I) diff --git a/w3lib/html.py b/w3lib/html.py index f0f0184..760c0da 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -4,12 +4,12 @@ import re from html.entities import name2codepoint -from typing import Iterable, Match, AnyStr, Optional, Pattern, Tuple, Union +from typing import AnyStr, Iterable, Match, Optional, Pattern, Tuple, Union from urllib.parse import urljoin 
-from w3lib.util import to_unicode -from w3lib.url import safe_url_string from w3lib._types import StrOrBytes +from w3lib.url import safe_url_string +from w3lib.util import to_unicode _ent_re = re.compile( r"&((?P[a-z\d]+)|#(?P\d+)|#x(?P[a-f\d]+))(?P;?)", diff --git a/w3lib/http.py b/w3lib/http.py index a3e4e17..bdb3f66 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,5 +1,6 @@ from base64 import b64encode -from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping +from typing import Any, AnyStr, List, Mapping, MutableMapping, Optional, Sequence, Union + from w3lib.util import to_bytes, to_unicode HeadersDictInput = Mapping[bytes, Union[Any, Sequence[bytes]]] diff --git a/w3lib/url.py b/w3lib/url.py index 485e694..52cf6ad 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -2,6 +2,7 @@ This module contains general purpose URL functions not found in the standard library. """ + import base64 import codecs import os @@ -9,7 +10,6 @@ import re import string from typing import ( - cast, Callable, Dict, List, @@ -18,12 +18,15 @@ Sequence, Tuple, Union, + cast, ) +from urllib.parse import _coerce_args # type: ignore from urllib.parse import ( + ParseResult, parse_qs, parse_qsl, - ParseResult, quote, + unquote, unquote_to_bytes, urldefrag, urlencode, @@ -31,15 +34,13 @@ urlsplit, urlunparse, urlunsplit, - unquote, ) -from urllib.parse import _coerce_args # type: ignore from urllib.request import pathname2url, url2pathname -from .util import to_unicode from ._infra import _ASCII_TAB_OR_NEWLINE, _C0_CONTROL_OR_SPACE from ._types import AnyUnicodeError, StrOrBytes from ._url import _SPECIAL_SCHEMES +from .util import to_unicode # error handling function for bytes-to-Unicode decoding errors with URLs diff --git a/w3lib/util.py b/w3lib/util.py index 70f4ef5..61426e8 100644 --- a/w3lib/util.py +++ b/w3lib/util.py @@ -1,5 +1,5 @@ -from warnings import warn from typing import Optional +from warnings import warn from w3lib._types import StrOrBytes