Skip to content

Commit

Permalink
Implement adapters that disallow internal matches
Browse files Browse the repository at this point in the history
Could also be considered "less strict anchoring"

Specify -a ADAPTERX (with a literal X) to disallow matches where the
adapter sequence in the read is followed by further nucleotides.

Specify -g XADAPTER to disallow matches where the adapter sequence is
preceded by nucleotides.

Needs documentation.

Closes #53
  • Loading branch information
marcelm committed Apr 30, 2018
1 parent fe062ab commit 40907f8
Show file tree
Hide file tree
Showing 9 changed files with 222 additions and 66 deletions.
6 changes: 6 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ Changes
development version
-------------------

* Close :issue:`53`: Implement adapters that disallow internal matches.
This is a bit like anchoring, but less strict: The adapter sequence
can appear at different lengths, but must always be at one of the ends.
Use ``-a ADAPTERX`` (with a literal ``X``) to disallow internal matches
for a 3' adapter. Use ``-g XADAPTER`` to disallow for a 5' adapter.
Mnemonic: The ``X`` is not allowed to “shift into” the read.
* :user:`klugem` contributed PR :issue:`299`: The ``--length`` option (and its
alias ``-l``) can now be used with negative lengths, which will remove bases
from the beginning of the read instead of from the end.
Expand Down
4 changes: 2 additions & 2 deletions src/cutadapt/_align.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ class DPMatrix:
"""
Representation of the dynamic-programming matrix.
This used only when debugging is enabled in the Aligner class since the
This is used only when debugging is enabled in the Aligner class since the
matrix is normally not stored in full.
Entries in the matrix may be None, in which case that value was not
Expand Down Expand Up @@ -226,7 +226,7 @@ cdef class Aligner:
"""
def __set__(self, value):
if value < 1:
raise ValueError('Insertion/deletion cost must be at leat 1')
raise ValueError('Insertion/deletion cost must be at least 1')
self._insertion_cost = value
self._deletion_cost = value

Expand Down
201 changes: 143 additions & 58 deletions src/cutadapt/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@
from cutadapt.seqio import FastaReader


# Constants for the find_best_alignment function.
# Constants for the Aligner.locate() function.
# The function is called with SEQ1 as the adapter, SEQ2 as the read.
# TODO get rid of those constants, use strings instead
BACK = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ1
FRONT = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.START_WITHIN_SEQ1
PREFIX = align.STOP_WITHIN_SEQ2
SUFFIX = align.START_WITHIN_SEQ2
# Just like FRONT/BACK, but without internal matches
FRONT_NOT_INTERNAL = align.START_WITHIN_SEQ1 | align.STOP_WITHIN_SEQ2
BACK_NOT_INTERNAL = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ1
ANYWHERE = align.SEMIGLOBAL
LINKED = 'linked'

Expand Down Expand Up @@ -73,6 +76,67 @@ def __init__(self, colorspace=False, **kwargs):
self.constructor_args = kwargs
self.adapter_class = ColorspaceAdapter if colorspace else Adapter

@staticmethod
def _parse_not_linked(spec, cmdline_type):
"""
Parse an adapter specification for a non-linked adapter (without '...')
Allow:
'back' and ADAPTER
'back' and ADAPTERX
'back' and ADAPTER$
'front' and ADAPTER
'front' and XADAPTER
'front' and ^ADAPTER
'anywhere' and ADAPTER
"""
error = ValueError(
"You cannot use multiple placement restrictions for an adapter at the same time. "
"Choose one of ^ADAPTER, ADAPTER$, XADAPTER or ADAPTERX")

# TODO
# Special case for adapter consisting of only X characters:
# This needs to be supported for backwards-compatibilitity
if len(spec.strip('X')) == 0:
return None, spec, None

front_restriction = None
if spec.startswith('^'):
front_restriction = 'anchored'
spec = spec[1:]
if spec.upper().startswith('X'):
if front_restriction is not None:
raise error
front_restriction = 'noninternal'
spec = spec.lstrip('xX')

back_restriction = None
if spec.endswith('$'):
back_restriction = 'anchored'
spec = spec[:-1]
if spec.upper().endswith('X'):
if back_restriction is not None:
raise error
back_restriction = 'noninternal'
spec = spec.rstrip('xX')

n_placement_restrictions = int(bool(front_restriction)) + int(bool(back_restriction))
if n_placement_restrictions > 1:
raise error

if cmdline_type == 'front' and back_restriction:
raise ValueError(
"Allowed placement restrictions for a 5' adapter are XADAPTER and ^ADAPTER")
if cmdline_type == 'back' and front_restriction:
raise ValueError(
"Allowed placement restrictions for a 3' adapter are ADAPTERX and ADAPTER$")

if cmdline_type == 'anywhere' and n_placement_restrictions > 0:
raise ValueError(
"Placement restrictions (with X, ^, $) not supported for 'anywhere' (-b) adapters")
assert front_restriction is None or back_restriction is None
return front_restriction, spec, back_restriction

def _parse_no_file(self, spec, name=None, cmdline_type='back'):
"""
Parse an adapter specification not using ``file:`` notation and return
Expand All @@ -86,68 +150,72 @@ def _parse_no_file(self, spec, name=None, cmdline_type='back'):
"""
if name is None:
name, spec = self._extract_name(spec)
orig_spec = spec
types = dict(back=BACK, front=FRONT, anywhere=ANYWHERE)
if cmdline_type not in types:
if cmdline_type not in ('front', 'back', 'anywhere'):
raise ValueError('cmdline_type cannot be {0!r}'.format(cmdline_type))
where = types[cmdline_type]

front_anchored, back_anchored = False, False
if spec.startswith('^'):
spec = spec[1:]
front_anchored = True
if spec.endswith('$'):
spec = spec[:-1]
back_anchored = True
spec1, middle, spec2 = spec.partition('...')
del spec

sequence1, middle, sequence2 = spec.partition('...')
if where == ANYWHERE:
if front_anchored or back_anchored:
raise ValueError("'anywhere' (-b) adapters may not be anchored")
if middle == '...':
# Handle linked adapter
if middle == '...' and spec1 and spec2:
if cmdline_type == 'anywhere':
raise ValueError("'anywhere' (-b) adapters may not be linked")
return self.adapter_class(sequence=spec, where=where, name=name, **self.constructor_args)
if self.colorspace:
raise NotImplementedError(
'Using linked adapters in colorspace is not supported')
front1, sequence1, back1 = self._parse_not_linked(spec1, 'front')
front2, sequence2, back2 = self._parse_not_linked(spec2, 'back')

# Automatically anchor the 5' adapter if -a is used
if cmdline_type == 'back' and front1 is None:
front1 = 'anchored'

front_anchored = front1 == 'anchored'
back_anchored = back2 == 'anchored'
require_both = True if not front_anchored and not back_anchored else None
return LinkedAdapter(
sequence1, sequence2, name=name,
front_restriction=front1,
back_restriction=back2,
require_both=require_both,
**self.constructor_args)

assert where == FRONT or where == BACK
if middle == '...':
if not sequence1:
if where == BACK: # -a ...ADAPTER
spec = sequence2
if not spec1:
if cmdline_type == 'back': # -a ...ADAPTER
spec = spec2
else: # -g ...ADAPTER
raise ValueError('Invalid adapter specification')
elif not sequence2:
if where == BACK: # -a ADAPTER...
spec = sequence1
where = FRONT
front_anchored = True
elif not spec2:
if cmdline_type == 'back': # -a ADAPTER...
cmdline_type = 'front'
spec = '^' + spec1
else: # -g ADAPTER...
spec = sequence1
spec = spec1
else:
# linked adapter
if self.colorspace:
raise NotImplementedError(
'Using linked adapters in colorspace is not supported')
# automatically anchor 5' adapter if -a is used
if where == BACK:
front_anchored = True

require_both = True if not front_anchored and not back_anchored else None
return LinkedAdapter(sequence1, sequence2, name=name,
front_anchored=front_anchored, back_anchored=back_anchored,
require_both=require_both,
**self.constructor_args)
if front_anchored and back_anchored:
raise ValueError('Trying to use both "^" and "$" in adapter specification {!r}'.format(orig_spec))
if front_anchored:
if where == BACK:
raise ValueError("Cannot anchor the 3' adapter at its 5' end")
assert False, 'This should not happen'
else:
spec = spec1

front_restriction, sequence, back_restriction = self._parse_not_linked(spec, cmdline_type)
del spec
if front_restriction == 'anchored':
where = PREFIX
elif back_anchored:
if where == FRONT:
raise ValueError("Cannot anchor 5' adapter at 3' end")
elif front_restriction == 'noninternal':
where = FRONT_NOT_INTERNAL
elif back_restriction == 'anchored':
where = SUFFIX
elif back_restriction == 'noninternal':
where = BACK_NOT_INTERNAL
elif cmdline_type == 'front':
where = FRONT
elif cmdline_type == 'back':
where = BACK
else:
assert cmdline_type == 'anywhere'
where = ANYWHERE

return self.adapter_class(sequence=spec, where=where, name=name, **self.constructor_args)
return self.adapter_class(sequence=sequence, where=where, name=name, **self.constructor_args)

def parse(self, spec, cmdline_type='back'):
"""
Expand Down Expand Up @@ -483,7 +551,7 @@ def __init__(self, sequence, where, max_error_rate=0.1, min_overlap=3,
# Optimization: Use non-wildcard matching if only ACGT is used
self.adapter_wildcards = adapter_wildcards and not set(self.sequence) <= set('ACGT')
self.read_wildcards = read_wildcards
self.remove_before = where not in (BACK, SUFFIX)
self.remove_before = where not in (BACK, SUFFIX, BACK_NOT_INTERNAL)

self.aligner = align.Aligner(self.sequence, self.max_error_rate,
flags=self.where, wildcard_ref=self.adapter_wildcards, wildcard_query=self.read_wildcards)
Expand Down Expand Up @@ -526,8 +594,9 @@ def match_to(self, read, match_class=Match):
pos = 0 if read_seq.startswith(self.sequence) else -1
elif self.where == SUFFIX:
pos = (len(read_seq) - len(self.sequence)) if read_seq.endswith(self.sequence) else -1
else:
elif self.where == BACK or self.where == FRONT:
pos = read_seq.find(self.sequence)
# TODO BACK_NOT_INTERNAL, FRONT_NOT_INTERNAL
if pos >= 0:
if self.where == ANYWHERE:
# guess: if alignment starts at pos 0, it’s a 5' adapter
Expand Down Expand Up @@ -686,21 +755,37 @@ def update_statistics(self, statistics):
class LinkedAdapter(object):
"""
"""
def __init__(self, front_sequence, back_sequence,
front_anchored=True, back_anchored=False, require_both=None, name=None, **kwargs):
def __init__(self, front_sequence, back_sequence, front_restriction='anchored',
back_restriction=None, require_both=None, name=None, **kwargs):
"""
require_both -- require both adapters to match. If not specified, the default is to
require only anchored adapters to match.
kwargs are passed on to individual Adapter constructors
"""
where1 = PREFIX if front_anchored else FRONT
where2 = SUFFIX if back_anchored else BACK
if front_restriction is None:
where1 = FRONT
elif front_restriction == 'anchored':
where1 = PREFIX
elif front_restriction == 'noninternal':
where1 = FRONT_NOT_INTERNAL
else:
raise ValueError('Value {0} for front_restriction not allowed'.format(front_restriction))

if back_restriction is None:
where2 = BACK
elif back_restriction == 'anchored':
where2 = SUFFIX
elif back_restriction == 'noninternal':
where2 = BACK_NOT_INTERNAL
else:
raise ValueError(
'Value {0} for back_restriction not allowed'.format(back_restriction))
if require_both:
self._require_back_match = True
self._require_front_match = True
else:
self._require_front_match = front_anchored
self._require_back_match = back_anchored
self._require_front_match = front_restriction == 'anchored'
self._require_back_match = back_restriction == 'anchored'

# The following attributes are needed for the report
self.where = LINKED
Expand Down
12 changes: 8 additions & 4 deletions src/cutadapt/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from contextlib import contextmanager
import textwrap
from .adapters import BACK, FRONT, PREFIX, SUFFIX, ANYWHERE, LINKED
from .adapters import BACK, BACK_NOT_INTERNAL, FRONT, FRONT_NOT_INTERNAL, PREFIX, SUFFIX, ANYWHERE, LINKED
from .modifiers import QualityTrimmer, NextseqQualityTrimmer, AdapterCutter
from .filters import (NoFilter, PairedNoFilter, TooShortReadFilter, TooLongReadFilter,
DiscardTrimmedFilter, DiscardUntrimmedFilter, PairedEndDemultiplexer, Demultiplexer,
Expand Down Expand Up @@ -159,7 +159,9 @@ def too_many_n_fraction(self):

ADAPTER_TYPES = {
BACK: "regular 3'",
BACK_NOT_INTERNAL: "non-internal 3'",
FRONT: "regular 5'",
FRONT_NOT_INTERNAL: "non-internal 5'",
PREFIX: "anchored 5'",
SUFFIX: "anchored 3'",
ANYWHERE: "variable 5'/3'",
Expand Down Expand Up @@ -310,7 +312,9 @@ def print_report(stats, time, gc_content):
total_back = sum(adapter_statistics.back.lengths.values())
total = total_front + total_back
where = adapter_statistics.where
assert where in (ANYWHERE, LINKED) or (where in (BACK, SUFFIX) and total_front == 0) or (where in (FRONT, PREFIX) and total_back == 0)
assert (where in (ANYWHERE, LINKED)
or (where in (BACK, BACK_NOT_INTERNAL, SUFFIX) and total_front == 0)
or (where in (FRONT, FRONT_NOT_INTERNAL, PREFIX) and total_back == 0)), (where, total_front)

if stats.paired:
extra = 'First read: ' if which_in_pair == 0 else 'Second read: '
Expand Down Expand Up @@ -354,13 +358,13 @@ def print_report(stats, time, gc_content):
print()
print("Overview of removed sequences at 3' end")
print_histogram(adapter_statistics.back, stats.n, gc_content)
elif where in (FRONT, PREFIX):
elif where in (FRONT, PREFIX, FRONT_NOT_INTERNAL):
print()
print_error_ranges(len(adapter_statistics.front.sequence), adapter_statistics.front.max_error_rate)
print("Overview of removed sequences")
print_histogram(adapter_statistics.front, stats.n, gc_content)
else:
assert where in (BACK, SUFFIX)
assert where in (BACK, SUFFIX, BACK_NOT_INTERNAL)
print()
print_error_ranges(len(adapter_statistics.back.sequence), adapter_statistics.back.max_error_rate)
warning = warning or print_adjacent_bases(adapter_statistics.back.adjacent_bases)
Expand Down
14 changes: 14 additions & 0 deletions tests/cut/adapterx.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
>r1
CGTCCGAAGTAGCTAtccgaatagaCCACCCTGATTAGACAAAT
>r2
tccgaatagaAGCCGCTACGACGGGTTGGCCCTTAGACGTATCT
>r3
atagaCAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r4
CtccgaatagaAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r5
CAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r6
CAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r5
CAAGATCTACCCTGCCACATTGCCCTAGTT
14 changes: 14 additions & 0 deletions tests/cut/xadapter.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
>r1
CGTCCGAAGTAGCTAtccgaatagaCCACCCTGATTAGACAAAT
>r2
AGCCGCTACGACGGGTTGGCCCTTAGACGTATCT
>r3
CAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r4
AAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r5
CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccgaataga
>r6
CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccga
>r5
CAAGATCTACCCTGCCACATTGCCCTAGTTtccgaatagaA
14 changes: 14 additions & 0 deletions tests/data/xadapterx.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
>r1
CGTCCGAAGTAGCTAtccgaatagaCCACCCTGATTAGACAAAT
>r2
tccgaatagaAGCCGCTACGACGGGTTGGCCCTTAGACGTATCT
>r3
atagaCAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r4
CtccgaatagaAAGATCTACCCTGCCACATTGCCCTAGTTAAAC
>r5
CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccgaataga
>r6
CAAGATCTACCCTGCCACATTGCCCTAGTTAAACtccga
>r5
CAAGATCTACCCTGCCACATTGCCCTAGTTtccgaatagaA
Loading

0 comments on commit 40907f8

Please sign in to comment.