From 1564039ec2e9ea0b59dcf489ae2c50612c57336c Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Wed, 7 Dec 2022 15:30:55 -0700 Subject: [PATCH 1/5] replaced deprecated Bio.pairwise2 with Bio.Align.PairwiseAligner - fix #3950 - engineered return tuple as namedtuple to functionally match the originally returned pairwise2.Alignment; we only documented a tuple as return type anyway - update CHANGELOG --- package/CHANGELOG | 4 ++- package/MDAnalysis/analysis/align.py | 47 +++++++++++++++++++++------- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 802a17f0f33..429a6e53336 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -14,7 +14,7 @@ The rules for this file: ------------------------------------------------------------------------------ ??/??/?? IAlibay, Luthaf, hmacdope, rafaelpap, jbarnoud, BFedder, aya9aladdin, - jaclark5, jfennick, lilyminium + jaclark5, jfennick, lilyminium, orbeckst * 2.4.0 @@ -70,6 +70,8 @@ Enhancements * Added isolayer selection method (Issue #3845) Changes + * Replaced deprecated Bio.pairwise2 with Bio.align.PairwiseAligner in + MDAnalysis.analysis.align.sequence_alignment (Issue #3950) * Moved `libmdanalysis` cython header to `lib`(Issue #3912, PR #3913) * Auxiliary; determination of representative frames: The default value for cutoff is now None, not -1. Negative values are now turned diff --git a/package/MDAnalysis/analysis/align.py b/package/MDAnalysis/analysis/align.py index 18c0a3f5c2d..af0c3524799 100644 --- a/package/MDAnalysis/analysis/align.py +++ b/package/MDAnalysis/analysis/align.py @@ -187,13 +187,14 @@ import os.path import warnings import logging +import collections import numpy as np import Bio.SeqIO import Bio.AlignIO +import Bio.Align import Bio.Align.Applications -import Bio.pairwise2 import MDAnalysis as mda import MDAnalysis.lib.qcprot as qcp @@ -973,7 +974,7 @@ def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, The residues in `reference` and `mobile` will be globally aligned. The global alignment uses the Needleman-Wunsch algorithm as - implemented in :mod:`Bio.pairwise2`. The parameters of the dynamic + implemented in :mod:`Bio.Align.PairwiseAligner`. The parameters of the dynamic programming algorithm can be tuned with the keywords. The defaults should be suitable for two similar sequences. For sequences with low sequence identity, more specialized tools such as clustalw, @@ -1004,22 +1005,46 @@ def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, See Also -------- - BioPython documentation for `pairwise2`_. Alternatively, use + BioPython documentation for `PairwiseAligner`_. Alternatively, use :func:`fasta2select` with :program:`clustalw2` and the option ``is_aligned=False``. - .. _`pairwise2`: http://biopython.org/DIST/docs/api/Bio.pairwise2-module.html + + .. _`PairwiseAligner`: + https://biopython.org/docs/latest/api/Bio.Align.html#Bio.Align.PairwiseAligner + .. versionadded:: 0.10.0 - """ - aln = Bio.pairwise2.align.globalms( - reference.residues.sequence(format="string"), - mobile.residues.sequence(format="string"), - match_score, mismatch_penalty, gap_penalty, gapextension_penalty) - # choose top alignment - return aln[0] + .. versionchanged:: 2.4.0 + Replace use of deprecated :func:`Bio.pairwise2.align.globalms` with + :class:`Bio.Align.PairwiseAligner`. + """ + aligner = Bio.Align.PairwiseAligner( + mode="global", + match_score=match_score, + mismatch_score=mismatch_penalty, + open_gap_score=gap_penalty, + extend_gap_score=gapextension_penalty) + aln = aligner.align(reference.residues.sequence(format="Seq"), + mobile.residues.sequence(format="Seq")) + # choose top alignment with highest score + topalignment = aln[0] + + # reconstruct the results tuple that used to be of type Bio.pairwise2.Alignment + AlignmentTuple = collections.namedtuple( + "Alignment", + ["seqA", "seqB", "score", "start", "end"]) + # extract sequences (there's no obvious way to get the character + # representation with gaps by other means from the new + # Bio.Align.PairwiseAlignment instance) + seqA, _, seqB, _ = topalignment.format().split("\n") + + # start/stop are not particularly meaningful and there's no obvious way to + # get the old pairwise2 start/stop from the new PairwiseAligner output. + return AlignmentTuple(seqA, seqB, topalignment.score, + 0, max(reference.n_residues, mobile.n_residues)) def fasta2select(fastafilename, is_aligned=False, ref_resids=None, target_resids=None, From efed979dbc838b1ab685e56322814542a0f335d8 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Wed, 7 Dec 2022 16:46:54 -0700 Subject: [PATCH 2/5] use biopython >= 1.80 feature for updated alig.sequence_alignment() --- .github/actions/setup-deps/action.yaml | 2 +- package/CHANGELOG | 1 + package/MDAnalysis/analysis/align.py | 8 ++------ package/requirements.txt | 2 +- package/setup.py | 2 +- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/actions/setup-deps/action.yaml b/.github/actions/setup-deps/action.yaml index d805142f985..a2901a375b6 100644 --- a/.github/actions/setup-deps/action.yaml +++ b/.github/actions/setup-deps/action.yaml @@ -15,7 +15,7 @@ inputs: default: false # conda-installed min dependencies biopython: - default: 'biopython' + default: 'biopython>=1.80' codecov: default: 'codecov' cython: diff --git a/package/CHANGELOG b/package/CHANGELOG index 429a6e53336..78fe6616e85 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -72,6 +72,7 @@ Enhancements Changes * Replaced deprecated Bio.pairwise2 with Bio.align.PairwiseAligner in MDAnalysis.analysis.align.sequence_alignment (Issue #3950) + * Increased minimal version of biopython to 1.80 (Issue #3950) * Moved `libmdanalysis` cython header to `lib`(Issue #3912, PR #3913) * Auxiliary; determination of representative frames: The default value for cutoff is now None, not -1. Negative values are now turned diff --git a/package/MDAnalysis/analysis/align.py b/package/MDAnalysis/analysis/align.py index af0c3524799..4bd2de97914 100644 --- a/package/MDAnalysis/analysis/align.py +++ b/package/MDAnalysis/analysis/align.py @@ -1036,14 +1036,10 @@ def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, AlignmentTuple = collections.namedtuple( "Alignment", ["seqA", "seqB", "score", "start", "end"]) - # extract sequences (there's no obvious way to get the character - # representation with gaps by other means from the new - # Bio.Align.PairwiseAlignment instance) - seqA, _, seqB, _ = topalignment.format().split("\n") - # start/stop are not particularly meaningful and there's no obvious way to # get the old pairwise2 start/stop from the new PairwiseAligner output. - return AlignmentTuple(seqA, seqB, topalignment.score, + return AlignmentTuple(topalignment[0], topalignment[1], + topalignment.score, 0, max(reference.n_residues, mobile.n_residues)) def fasta2select(fastafilename, is_aligned=False, diff --git a/package/requirements.txt b/package/requirements.txt index 1400c24af02..44c34303a9f 100644 --- a/package/requirements.txt +++ b/package/requirements.txt @@ -1,4 +1,4 @@ -biopython +biopython>=1.80 codecov cython fasteners diff --git a/package/setup.py b/package/setup.py index 9fb5cd2f778..1aa82e55652 100755 --- a/package/setup.py +++ b/package/setup.py @@ -594,7 +594,7 @@ def long_description(readme): install_requires = [ 'numpy>=1.20.0', - 'biopython>=1.71', + 'biopython>=1.80', 'networkx>=2.0', 'GridDataFormats>=0.4.0', 'mmtf-python>=1.0.0', From cd03bb8e944fee4d72153cd18233c80dcfc33066 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Wed, 7 Dec 2022 16:58:29 -0700 Subject: [PATCH 3/5] additional docs for sequence_alignment() (The docs show how one should write code if/when we deprecated this function.) --- package/MDAnalysis/analysis/align.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/package/MDAnalysis/analysis/align.py b/package/MDAnalysis/analysis/align.py index 4bd2de97914..3f2a8ac8d16 100644 --- a/package/MDAnalysis/analysis/align.py +++ b/package/MDAnalysis/analysis/align.py @@ -1003,6 +1003,28 @@ def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, Tuple of top sequence matching output `('Sequence A', 'Sequence B', score, begin, end)` + Notes + ----- + If you prefer to work directly with :mod:`Bio.Align` objects then you can + run your alignment with :class:`Bio.Alig.PairwiseAligner` as :: + + import Bio.Align.PairwiseAligner + + aligner = Bio.Align.PairwiseAligner( + mode="global", + match_score=match_score, + mismatch_score=mismatch_penalty, + open_gap_score=gap_penalty, + extend_gap_score=gapextension_penalty) + aln = aligner.align(reference.residues.sequence(format="Seq"), + mobile.residues.sequence(format="Seq")) + + # choose top alignment with highest score + topalignment = aln[0] + + The ``topalignment`` is a :class:`Bio.Align.PairwiseAlignment` instance + that can be used in your bioinformatics workflows. + See Also -------- BioPython documentation for `PairwiseAligner`_. Alternatively, use @@ -1042,6 +1064,7 @@ def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, topalignment.score, 0, max(reference.n_residues, mobile.n_residues)) + def fasta2select(fastafilename, is_aligned=False, ref_resids=None, target_resids=None, ref_offset=0, target_offset=0, verbosity=3, From 163c20a952144f2506eadbe23759bec759107a11 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Wed, 7 Dec 2022 23:38:38 -0700 Subject: [PATCH 4/5] deprecate align.sequence_alignment for removal in 3.0 - add deprecation - test for deprecation warning - combined tests for sequence_alignment in a class for easier removal - update CHANGELOG --- package/CHANGELOG | 3 +- package/MDAnalysis/analysis/align.py | 5 +- .../MDAnalysisTests/analysis/test_align.py | 46 +++++++++++++------ 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/package/CHANGELOG b/package/CHANGELOG index 78fe6616e85..1d81a82c1f0 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -89,10 +89,11 @@ Changes * adding element attribute to TXYZParser if all atom names are valid element symbols (PR #3826) Deprecations + * Deprecated analysis.align.sequence_alignment() for removal in 3.0 (#3950) * Add deprecation warning for `timestep` copying in DCDReader (Issue #3889, PR #3888) * Add deprecation warning for inclusive stop indexing in - MemoryReader.timeseries (#PR 3894) + MemoryReader.timeseries (PR #3894) 08/29/22 IAlibay, PicoCentauri, orbeckst, hmacdope, rmeli, miss77jun, rzhao271, diff --git a/package/MDAnalysis/analysis/align.py b/package/MDAnalysis/analysis/align.py index 3f2a8ac8d16..75746e60b50 100644 --- a/package/MDAnalysis/analysis/align.py +++ b/package/MDAnalysis/analysis/align.py @@ -202,6 +202,7 @@ import MDAnalysis.analysis.rms as rms from MDAnalysis.coordinates.memory import MemoryReader from MDAnalysis.lib.util import get_weights +from MDAnalysis.lib.util import deprecate # remove 3.0 from .base import AnalysisBase @@ -967,7 +968,9 @@ def rmsd(self): warnings.warn(wmsg, DeprecationWarning) return self.results.rmsd - +@deprecate(release="2.4.0", remove="3.0", + message="See the documentation under Notes how directly use" + "Bio.Align.PairwiseAligner with ResidueGroups.") def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, gap_penalty=-2, gapextension_penalty=-0.1): """Generate a global sequence alignment between two residue groups. diff --git a/testsuite/MDAnalysisTests/analysis/test_align.py b/testsuite/MDAnalysisTests/analysis/test_align.py index 17c15c6d87c..08e284c94a2 100644 --- a/testsuite/MDAnalysisTests/analysis/test_align.py +++ b/testsuite/MDAnalysisTests/analysis/test_align.py @@ -542,22 +542,38 @@ def test_fasta2select_resids(self, tmpdir): assert len(sel['reference']) == 30621, self.error_msg assert len(sel['mobile']) == 30621, self.error_msg -def test_sequence_alignment(): - u = mda.Universe(PSF) - reference = u.atoms - mobile = u.select_atoms("resid 122-159") - aln = align.sequence_alignment(mobile, reference) - - assert len(aln) == 5, "return value has wrong tuple size" - - seqA, seqB, score, begin, end = aln - assert_equal(seqA, reference.residues.sequence(format="string"), - err_msg="reference sequence mismatch") - assert mobile.residues.sequence( - format="string") in seqB, "mobile sequence mismatch" - assert_almost_equal(score, 54.6) - assert_array_equal([begin, end], [0, reference.n_residues]) +class TestSequenceAlignmentFunction: + # remove 3.0 + @staticmethod + @pytest.fixture + def atomgroups(): + universe = mda.Universe(PSF) + reference = universe.atoms + mobile = universe.select_atoms("resid 122-159") + return reference, mobile + + @pytest.mark.filterwarnings("ignore:`sequence_alignment` is deprecated!") + def test_sequence_alignment(self, atomgroups): + reference, mobile = atomgroups + aln = align.sequence_alignment(mobile, reference) + + assert len(aln) == 5, "return value has wrong tuple size" + + seqA, seqB, score, begin, end = aln + assert_equal(seqA, reference.residues.sequence(format="string"), + err_msg="reference sequence mismatch") + assert mobile.residues.sequence( + format="string") in seqB, "mobile sequence mismatch" + assert_almost_equal(score, 54.6) + assert_array_equal([begin, end], [0, reference.n_residues]) + + def test_sequence_alignment_deprecation(self, atomgroups): + reference, mobile = atomgroups + wmsg = ("`sequence_alignment` is deprecated!\n" + "`sequence_alignment` will be removed in release 3.0.") + with pytest.warns(DeprecationWarning, match=wmsg): + align.sequence_alignment(mobile, reference) def test_alignto_reorder_atomgroups(): # Issue 2977 From cd5559173415957def0a07da1cb54d3e13e914e1 Mon Sep 17 00:00:00 2001 From: Oliver Beckstein Date: Thu, 8 Dec 2022 13:02:09 -0700 Subject: [PATCH 5/5] Apply suggestions from code review Co-authored-by: Irfan Alibay --- package/MDAnalysis/analysis/align.py | 4 +++- testsuite/MDAnalysisTests/analysis/test_align.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/package/MDAnalysis/analysis/align.py b/package/MDAnalysis/analysis/align.py index 75746e60b50..9bb9f4798e2 100644 --- a/package/MDAnalysis/analysis/align.py +++ b/package/MDAnalysis/analysis/align.py @@ -968,8 +968,9 @@ def rmsd(self): warnings.warn(wmsg, DeprecationWarning) return self.results.rmsd + @deprecate(release="2.4.0", remove="3.0", - message="See the documentation under Notes how directly use" + message="See the documentation under Notes on how to directly use" "Bio.Align.PairwiseAligner with ResidueGroups.") def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, gap_penalty=-2, gapextension_penalty=-0.1): @@ -1068,6 +1069,7 @@ def sequence_alignment(mobile, reference, match_score=2, mismatch_penalty=-1, 0, max(reference.n_residues, mobile.n_residues)) + def fasta2select(fastafilename, is_aligned=False, ref_resids=None, target_resids=None, ref_offset=0, target_offset=0, verbosity=3, diff --git a/testsuite/MDAnalysisTests/analysis/test_align.py b/testsuite/MDAnalysisTests/analysis/test_align.py index 08e284c94a2..a3adc835a48 100644 --- a/testsuite/MDAnalysisTests/analysis/test_align.py +++ b/testsuite/MDAnalysisTests/analysis/test_align.py @@ -542,6 +542,7 @@ def test_fasta2select_resids(self, tmpdir): assert len(sel['reference']) == 30621, self.error_msg assert len(sel['mobile']) == 30621, self.error_msg + class TestSequenceAlignmentFunction: # remove 3.0 @@ -565,7 +566,7 @@ def test_sequence_alignment(self, atomgroups): err_msg="reference sequence mismatch") assert mobile.residues.sequence( format="string") in seqB, "mobile sequence mismatch" - assert_almost_equal(score, 54.6) + assert score == pytest.approx(54.6) assert_array_equal([begin, end], [0, reference.n_residues]) def test_sequence_alignment_deprecation(self, atomgroups): @@ -575,6 +576,7 @@ def test_sequence_alignment_deprecation(self, atomgroups): with pytest.warns(DeprecationWarning, match=wmsg): align.sequence_alignment(mobile, reference) + def test_alignto_reorder_atomgroups(): # Issue 2977 u = mda.Universe(PDB_helix)