"""
Should take in a sam file from a aligner like bwa aln or bwa mem and convert it into a
"""
import itertools
import pysam
import subprocess
import warnings
import os
from copy import copy
from .constants import COLUMNS, SVTYPE, CIGAR, reverse_complement, ORIENT
from .bam import cigar as cigar_tools
from .bam import read as read_tools
from .interval import Interval
from .util import devnull
from .breakpoint import BreakpointPair
from .error import InvalidRearrangement
from vocab import Vocab
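
# controlled vocabulary of the aligners this module knows how to invoke (see align_contigs below)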
SUPPORTED_ALIGNER = Vocab(BWA_MEM='bwa mem', BLAT='blat')


class SplitAlignment:
def __init__(self, read1, read2=None):
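        """
        Args:
            read1: the first alignment (a pysam-style aligned segment)
            read2: the optional second alignment of the same query sequence

        reads are stored ordered by reference name and then reference start; a ValueError is
        raised if the two alignments do not share the same (or reverse complement) sequence
        """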
if read2 is not None and any([
read1.reference_name > read2.reference_name,
read1.reference_name == read2.reference_name and read1.reference_start > read2.reference_start
]):
read1, read2 = read2, read1
self.read1 = read1
self.read2 = read2
if self.read2 is not None:
read2_seq = read2.query_sequence if not self.opposing_strands else reverse_complement(read2.query_sequence)
if read1.query_sequence != read2_seq:
raise ValueError('valid split alignments must share the same (or reverse complement) sequence')
self.query_sequence = self.read1.query_sequence
# TODO: if the reads 'share' query sequence overlap, remove this from the second alignment

    def __getitem__(self, index):
if isinstance(index, int):
if index == 0:
return self.read1
elif index == 1:
return self.read2
else:
raise IndexError('index out of bounds', index)
else:
raise ValueError('index must be an integer')

    @property
def opposing_strands(self):
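        """
        True when the two alignments map to opposite strands of the reference;
        always False when there is only a single alignment
        """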
if self.read2 is None:
return False
else:
return self.read1.is_reverse != self.read2.is_reverse

    def query_coverage_read1(self):
        """interval of the query sequence covered by the first alignment"""
        return query_coverage_interval(self.read1)

    def query_coverage_read2(self):
        """
        interval of the query sequence covered by the second alignment, expressed relative to the
        read1 sequence; falls back to the read1 interval when there is no second alignment
        """
seqlen = len(self.read1.query_sequence)
qc1 = self.query_coverage_read1()
qc2 = qc1
if self.read2 is not None:
qc2 = query_coverage_interval(self.read2)
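            # when read2 aligned to the opposite strand, mirror its interval so that it is
            # expressed relative to the read1 (forward query) sequence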
if self.read2.is_reverse != self.read1.is_reverse:
qc2 = Interval(seqlen - qc2.end, seqlen - qc2.start)
return qc2

    def query_coverage(self):
"""
interval representing the total region of the input sequence that is covered by the combination of alignments
"""
if self.read2 is None:
return self.query_coverage_read1()
else:
return self.query_coverage_read1() | self.query_coverage_read2()

    def query_consumption(self):
"""
fraction of the query sequence which is aligned (everything not soft-clipped) in either alignment
"""
if self.read2 is None or Interval.overlaps(self.query_coverage_read1(), self.query_coverage_read2()):
return len(self.query_coverage()) / len(self.query_sequence)
else:
return (len(self.query_coverage_read1()) + len(self.query_coverage_read2())) / len(self.query_sequence)

    def query_overlap_extension(self):
        """
        number of additional query bases covered by combining both alignments, beyond the larger
        of the two individual coverage intervals; 0 when there is no second alignment
        """
if self.read2 is not None:
max_init_overlap = max(len(self.query_coverage_read1()), len(self.query_coverage_read2()))
total_overlap = len(self.query_coverage()) - max_init_overlap
return total_overlap
else:
return 0

    def score(self, consec_bonus=10):
def score_matches(cigar):
return sum([v + (v - 1) * consec_bonus for s, v in cigar if s == CIGAR.EQ])
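        # each run of v consecutive exact matches scores v + (v - 1) * consec_bonus, so a single
        # uninterrupted match outscores the same number of matched bases split across blocks; the
        # final score is normalized against a perfect contiguous match over the covered query length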
score = score_matches(self.read1.cigar)
qlen = len(self.query_coverage())
if self.read2 is not None:
score += score_matches(self.read2.cigar)
if Interval.overlaps(self.query_coverage_read1(), self.query_coverage_read2()):
qlen += len(self.query_coverage_read1() & self.query_coverage_read2())
return score / (qlen + (qlen - 1) * consec_bonus)

    @staticmethod
    def select_supporting_alignments(
            bpp, alignments,
            min_query_consumption,
            min_extend_overlap,
            max_event_size,
            min_anchor_size,
            merge_inner_anchor,
            merge_outer_anchor
    ):
"""
give a breakpoint pair and a set of alignments for contigs associated with the given pair,
alignments are paired (some events cannot be represented with a single bamfile alignment)
and the most appropriate alignments supporting the breakpoint pair are selected and returned
"""
# now for each bpp assign an alignment to each contig
putative_alignments = []
putative_event_types = set(bpp.putative_event_types())
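        # insertions and small duplications can be difficult to distinguish from a single
        # alignment, so if either is putative, consider both event types below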
if {SVTYPE.INS, SVTYPE.DUP} & putative_event_types:
putative_event_types = putative_event_types | {SVTYPE.INS, SVTYPE.DUP}
# for events on the same template and strand we expect to find a single contig alignment
if not bpp.interchromosomal and not bpp.opposing_strands:
for read in alignments:
try:
aln = SplitAlignment(read)
except ValueError:
continue
# if it covers both breakpoints add to putative alignments
ref_cover = Interval(read.reference_start, read.reference_end - 1)
if all([
aln.read1.reference_name == bpp.break1.chr,
Interval.overlaps(bpp.outer_window1, ref_cover),
Interval.overlaps(bpp.outer_window2, ref_cover)
]):
if aln.query_consumption() < min_query_consumption:
continue
# split the continuous alignment, assume ins/dup or indel
ins = sum([v for c, v in read.cigar if c == CIGAR.I] + [0])
dln = sum([v for c, v in read.cigar if c in [CIGAR.D, CIGAR.N]] + [0])
aln.read1 = copy(read)
aln.read1.cigar = cigar_tools.merge_internal_events(aln.read1.cigar, merge_inner_anchor, merge_outer_anchor)
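                    # decide from the CIGAR whether this single alignment looks insertion-like
                    # (more inserted than deleted bases) or deletion-like before assigning it to
                    # one of the putative event types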
for event_type in putative_event_types:
if event_type in {SVTYPE.INS, SVTYPE.DUP} and ins > 0 and ins > dln:
putative_alignments.append(aln)
elif event_type in {SVTYPE.DEL, SVTYPE.INV} and dln > 0 and dln > ins:
putative_alignments.append(aln)
# don't use reads in combined alignments if they have already been assigned in a single alignment
        combo_prohibited = [aln.read1 for aln in putative_alignments]
for read1, read2 in itertools.combinations([x for x in alignments if x not in combo_prohibited], 2):
# do they overlap both breakpoints
            if any([
                read1.reference_name > read2.reference_name,
                read1.reference_name == read2.reference_name and read1.reference_start > read2.reference_start
            ]):
read1, read2 = read2, read1
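            # after ordering, read1 should correspond to break1 and read2 to break2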
if read1.reference_name != bpp.break1.chr or read2.reference_name != bpp.break2.chr:
continue
read1 = read_tools.convert_events_to_softclipping(
read1, bpp.break1.orient, max_event_size=max_event_size, min_anchor_size=min_anchor_size)
read2 = read_tools.convert_events_to_softclipping(
read2, bpp.break2.orient, max_event_size=max_event_size, min_anchor_size=min_anchor_size)
try:
aln = SplitAlignment(read1, read2)
except ValueError:
continue
# check that the coverage intervals overlap the event windows
if any([
not Interval.overlaps((aln.read1.reference_start + 1, aln.read1.reference_end), bpp.outer_window1),
not Interval.overlaps((aln.read2.reference_start + 1, aln.read2.reference_end), bpp.outer_window2)
]):
continue
# reads should have unique reference overlap
if not bpp.interchromosomal and aln.read1.reference_end > aln.read2.reference_end:
continue
# check that the combination extends the amount of the initial query sequence we consume
qc = len(aln.query_coverage())
if any([
len(aln.query_coverage_read1()) >= qc or len(aln.query_coverage_read2()) >= qc,
aln.query_consumption() < min_query_consumption,
aln.read2 is not None and aln.query_overlap_extension() < min_extend_overlap
]):
continue
try:
call = BreakpointPair.call_breakpoint_pair(read1, read2)
if not set(BreakpointPair.classify(call)) & putative_event_types:
continue
except (InvalidRearrangement, AssertionError):
continue
putative_alignments.append(aln)
return putative_alignments

    @staticmethod
def breakpoint_contig_remapped_depth(breakpoint, contig, read):
if breakpoint.chr != read.reference_name:
raise AssertionError('breakpoint chromosome does not match read reference', breakpoint, read.reference_name)
if len(breakpoint) > 1:
raise NotImplementedError('only applies to exact breakpoint calls')
# get the reference positions for each breakpoint interval from the breakpointpair
# convert this to the query intervals using the alignment
# for each query interval calculate the read coverage as a pileup over the distance
s = read.reference_start + 1
t = read.reference_end
if breakpoint.orient == ORIENT.LEFT:
if breakpoint.start < s:
return 0
t = min(breakpoint.start, t)
elif breakpoint.orient == ORIENT.RIGHT:
if breakpoint.start > t:
return 0
s = max(s, breakpoint.start)
qrange = read_tools.map_ref_range_to_query_range(read, Interval(s, t))
return contig.remap_depth(qrange)


def query_coverage_interval(read):
"""
Returns:
:class:`~mavis.interval.Interval`: The portion of the original query sequence that is aligned by this read
"""
seq = read.query_sequence
s = 0
t = len(seq) - 1
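    # trim soft-clipped bases from either end; they are part of the query but are not aligned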
if read.cigar[0][0] == CIGAR.S:
s += read.cigar[0][1]
if read.cigar[-1][0] == CIGAR.S:
t -= read.cigar[-1][1]
return Interval(s, t)


def align_contigs(
evidence,
INPUT_BAM_CACHE,
reference_genome,
aligner,
aligner_reference,
aligner_output_file='aligner_out.temp',
aligner_fa_input_file='aligner_in.fa',
blat_min_identity=0.7,
contig_aln_min_query_consumption=0.5,
contig_aln_max_event_size=50,
contig_aln_min_anchor_size=50,
contig_aln_merge_inner_anchor=20,
contig_aln_merge_outer_anchor=20,
blat_limit_top_aln=25,
is_protein=False,
min_extend_overlap=10,
clean_files=True,
log=devnull,
**kwargs):
"""
given a set of contigs, call the aligner from the command line and adds the results to the contigs
associated with each Evidence object
"""
if is_protein:
raise NotImplementedError('currently does not support aligning protein sequences')
try:
# write the input sequences to a fasta file
query_id_mapping = {}
sequences = set()
count = 1
ev_by_seq = {}
for e in evidence:
for c in e.contigs:
sequences.add(c.seq)
ev_by_seq.setdefault(c.seq, []).append(e.data.get(COLUMNS.cluster_id, None))
with open(aligner_fa_input_file, 'w') as fh:
for seq in sequences:
n = 'seq{}'.format(count)
log(n, [x for x in ev_by_seq[seq] if x is not None])
query_id_mapping[n] = seq
fh.write('>' + n + '\n' + seq + '\n')
count += 1
if len(sequences) == 0:
return
log('will use', aligner, 'to align', len(sequences), 'unique sequences', time_stamp=False)
# call the aligner using subprocess
if aligner == SUPPORTED_ALIGNER.BLAT:
from .blat import process_blat_output
            # blat's -minIdentity option expects a percentage (0-100) rather than a fraction
            blat_min_identity *= 100
blat_options = kwargs.pop(
'blat_options', ["-stepSize=5", "-repMatch=2253", "-minScore=0", "-minIdentity={0}".format(blat_min_identity)])
# call the blat subprocess
# will raise subprocess.CalledProcessError if non-zero exit status
# parameters from https://genome.ucsc.edu/FAQ/FAQblat.html#blat4
            blat_command = [
                SUPPORTED_ALIGNER.BLAT, aligner_reference,
                aligner_fa_input_file, aligner_output_file, '-out=pslx', '-noHead'] + blat_options
            log(blat_command)
            subprocess.check_output(blat_command)
reads_by_query = process_blat_output(
INPUT_BAM_CACHE=INPUT_BAM_CACHE,
query_id_mapping=query_id_mapping,
reference_genome=reference_genome,
aligner_output_file=aligner_output_file,
blat_limit_top_aln=blat_limit_top_aln,
is_protein=is_protein
)
elif aligner == SUPPORTED_ALIGNER.BWA_MEM:
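            # -Y requests soft-clipping (rather than hard-clipping) of supplementary alignments so
            # that every output record retains the full query sequence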
command = '{} {} {} -Y'.format(aligner, aligner_reference, aligner_fa_input_file)
log(command) # for bwa
with open(aligner_output_file, 'w') as f:
                # raise subprocess.CalledProcessError if bwa exits with a non-zero status
                subprocess.check_call(command, stdout=f, shell=True)
with pysam.AlignmentFile(aligner_output_file, 'r') as samfile:
reads_by_query = {}
for read in samfile.fetch():
read = read_tools.SamRead.copy(read)
read.reference_id = INPUT_BAM_CACHE.reference_id(read.reference_name)
if read.is_paired:
read.next_reference_id = INPUT_BAM_CACHE.reference_id(read.next_reference_name)
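                    # recompute the cigar against the reference so that exact matches (=) and
                    # mismatches (X) are distinguished; downstream scoring relies on CIGAR.EQ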
read.cigar = cigar_tools.recompute_cigar_mismatch(read, reference_genome[read.reference_name])
query_seq = query_id_mapping[read.query_name]
reads_by_query.setdefault(query_seq, []).append(read)
else:
raise NotImplementedError('unsupported aligner', aligner)
for e in evidence:
for contig in e.contigs:
aln = reads_by_query.get(contig.seq, [])
putative_alignments = SplitAlignment.select_supporting_alignments(
e, aln,
min_extend_overlap=min_extend_overlap,
min_query_consumption=contig_aln_min_query_consumption,
min_anchor_size=contig_aln_min_anchor_size,
max_event_size=contig_aln_max_event_size,
merge_inner_anchor=contig_aln_merge_inner_anchor,
merge_outer_anchor=contig_aln_merge_outer_anchor
)
contig.alignments.extend(putative_alignments)
finally:
# clean up
if clean_files:
for f in [aligner_output_file, aligner_fa_input_file]:
if os.path.exists(f):
try:
os.remove(f)
except OSError as e:
warnings.warn(repr(e))