>temp_0|source:-*- coding: utf-8 -*-|start:5631|stop:6131
t.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: 
%.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequen
ce_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_le
ngths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = sel
f.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')
try:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file_pa
th, lazy_init = True
>temp_1|source:-*- coding: utf-8 -*-|start:1883|stop:2383
sh_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer
 = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.tot
al_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines()
 if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def ini
t_unique_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper
()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids
'].append(self.id)se
>temp_2|source:-*- coding: utf-8 -*-|start:4891|stop:5391
 y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlab
el('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if 
ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(rang
e(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.ytick
s(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(se
q_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = m
ax_seq_len)plt.ylim(
>temp_3|source:-*- coding: utf-8 -*-|start:2206|stop:2706
nique_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper())
.hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].
append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_di
ct[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.uniq
ue_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], has
h)\for hash in self.unique_hash_dict], reverse = True)]self.total_unique = len(s
elf.unique_hash_dict
>temp_4|source:-*- coding: utf-8 -*-|start:207|stop:707
oundation; either version 2 of the License, or (at your option)# any later versi
on.## Please read the docs/COPYING file.import sysimport numpyimport hashlibclas
s FastaOutput:def __init__(self, output_file_path):self.output_file_path = outpu
t_file_pathself.output_file_obj = open(output_file_path, 'w')def store(self, ent
ry, split = True, store_frequencies = True):if entry.unique and store_frequencie
s:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.
write_id(entry.id)se
>temp_5|source:-*- coding: utf-8 -*-|start:2909|stop:3409
y = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq 
= hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.
id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseel
se:return Falsedef next_regular(self):self.seq = Noneself.id = self.file_pointer
.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.readline()i
f not line:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse:retu
rn Falseif line.star
>temp_6|source:-*- coding: utf-8 -*-|start:1725|stop:2225
oneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 
0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash
_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer =
 open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total
_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() i
f l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_
unique_hash(self):wh
>temp_7|source:-*- coding: utf-8 -*-|start:92|stop:592
tribute it and/or modify it under# the terms of the GNU General Public License a
s published by the Free# Software Foundation; either version 2 of the License, o
r (at your option)# any later version.## Please read the docs/COPYING file.impor
t sysimport numpyimport hashlibclass FastaOutput:def __init__(self, output_file_
path):self.output_file_path = output_file_pathself.output_file_obj = open(output
_file_path, 'w')def store(self, entry, split = True, store_frequencies = True):i
f entry.unique and s
>temp_8|source:-*- coding: utf-8 -*-|start:511|stop:1011
 store(self, entry, split = True, store_frequencies = True):if entry.unique and 
store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.
ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(s
elf, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split 
= True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)de
f split(self, sequence, piece_length = 80):ticks = range(0, len(sequence), piece
_length) + [len(sequ
>temp_9|source:-*- coding: utf-8 -*-|start:2963|stop:3463
os]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash
_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']retu
rn Trueelse:return Falseelse:return Falsedef next_regular(self):self.seq = Nones
elf.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:line = sel
f.file_pointer.readline()if not line:if len(sequence):self.seq = sequenceself.po
s += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek
(self.file_pointer.t
>temp_10|source:-*- coding: utf-8 -*-|start:537|stop:1037
= True, store_frequencies = True):if entry.unique and store_frequencies:self.wri
te_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(e
ntry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_file_
obj.write('>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = sel
f.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, pi
ece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]
return '\n'.join([se
>temp_11|source:-*- coding: utf-8 -*-|start:6567|stop:7067
rip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.fi
le_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.stri
p()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return True
else:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointe
r.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscore
s.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retur
n Truedef close(self
>temp_12|source:-*- coding: utf-8 -*-|start:119|stop:619
 under# the terms of the GNU General Public License as published by the Free# So
ftware Foundation; either version 2 of the License, or (at your option)# any lat
er version.## Please read the docs/COPYING file.import sysimport numpyimport has
hlibclass FastaOutput:def __init__(self, output_file_path):self.output_file_path
 = output_file_pathself.output_file_obj = open(output_file_path, 'w')def store(s
elf, entry, split = True, store_frequencies = True):if entry.unique and store_fr
equencies:self.write
>temp_13|source:-*- coding: utf-8 -*-|start:1068|stop:1568
or x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()cl
ass ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self.fa
sta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 
0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memor
y: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.s
equences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.
close()class Sequenc
>temp_14|source:-*- coding: utf-8 -*-|start:30|stop:530
, A. Murat Eren## This program is free software; you can redistribute it and/or 
modify it under# the terms of the GNU General Public License as published by the
 Free# Software Foundation; either version 2 of the License, or (at your option)
# any later version.## Please read the docs/COPYING file.import sysimport numpyi
mport hashlibclass FastaOutput:def __init__(self, output_file_path):self.output_
file_path = output_file_pathself.output_file_obj = open(output_file_path, 'w')de
f store(self, entry,
>temp_15|source:-*- coding: utf-8 -*-|start:5770|stop:6270
hs),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_leng
ths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if de
st == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.s
avefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init_
_(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_pat
hself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals
 = Noneself.quals_in
>temp_16|source:-*- coding: utf-8 -*-|start:2308|stop:2808
 self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.un
ique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : sel
f.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] f
or i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in self.un
ique_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_dict)s
elf.reset()def next(self):if self.unique:return self.next_unique()else:return se
lf.next_regular()def
>temp_17|source:-*- coding: utf-8 -*-|start:857|stop:1357
f.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, pi
ece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]
return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) -
 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(se
lf, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)w
hile self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.
stderr.write('\r[fas
>temp_18|source:-*- coding: utf-8 -*-|start:3958|stop:4458
e, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec a
s gridspecsequence_lengths = []self.reset()while self.next():if self.pos % 10000
 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))
sys.stderr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.w
rite('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequ
ence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l
 in sequence_lengths
>temp_19|source:-*- coding: utf-8 -*-|start:3625|stop:4125
lf, read_id):self.reset()while self.next():if self.id == read_id:return self.seq
return Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 
0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visuali
ze_sequence_length_distribution(self, title, dest = None, max_seq_len = None, xt
ickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport matplotl
ib.gridspec as gridspecsequence_lengths = []self.reset()while self.next():if sel
f.pos % 10000 == 0 o
>temp_20|source:-*- coding: utf-8 -*-|start:2441|stop:2941
sh_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self
.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count']
, hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_unique = 
len(self.unique_hash_dict)self.reset()def next(self):if self.unique:return self.
next_unique()else:return self.next_regular()def next_unique(self):if self.unique
:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = self.uni
que_hash_dict[self.u
>temp_21|source:-*- coding: utf-8 -*-|start:4742|stop:5242
op = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 
0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, co
lor = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequenc
e length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep =
= None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep
, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, 
max(seq_len_distribu
>temp_22|source:-*- coding: utf-8 -*-|start:2513|stop:3013
: 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash][
'count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_u
nique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:retu
rn self.next_unique()else:return self.next_regular()def next_unique(self):if sel
f.unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = 
self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = ha
sh_entry['seq'] if s
>temp_23|source:-*- coding: utf-8 -*-|start:2346|stop:2846
h_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else
:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,
'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict
[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.
total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.uniq
ue:return self.next_unique()else:return self.next_regular()def next_unique(self)
:if self.unique:if s
>temp_24|source:-*- coding: utf-8 -*-|start:3861|stop:4361
ualize_sequence_length_distribution(self, title, dest = None, max_seq_len = None
, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport matp
lotlib.gridspec as gridspecsequence_lengths = []self.reset()while self.next():if
 self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: 
%s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.re
set()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths
) + (int(max(sequenc
>temp_25|source:-*- coding: utf-8 -*-|start:6453|stop:6953
pointer.readlines() if l.startswith('>')])self.reset()def next(self):self.id = s
elf.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequa
lscores = ''while 1:line = self.file_pointer.readline()if not line:if len(qualsc
ores):self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.qual
s.split()]self.pos += 1return Trueelse:return Falseif line.startswith('>'):self.
file_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + 
line.strip()self.qua
>temp_26|source:-*- coding: utf-8 -*-|start:1578|stop:2078
def __init__(self, fasta_file_path, lazy_init = True, unique = False, allow_mixe
d_case = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_
init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = N
oneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}sel
f.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fa
sta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneel
se:self.total_seq = 
>temp_27|source:-*- coding: utf-8 -*-|start:1611|stop:2111
h, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta_file_p
ath = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_
case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []se
lf.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.uniqu
e_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.s
eek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l 
in self.file_pointer
>temp_28|source:-*- coding: utf-8 -*-|start:5753|stop:6253
en(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\
min(sequence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size 
= 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.
pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualS
ource:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_path
 = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id 
 = Noneself.quals = 
>temp_29|source:-*- coding: utf-8 -*-|start:1891|stop:2391
= {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(
self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq =
 Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.st
artswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique
_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexd
igest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].appen
d(self.id)self.uniqu
>temp_30|source:-*- coding: utf-8 -*-|start:6078|stop:6578
:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_path = qu
als_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = No
neself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(se
lf.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals =
 Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines() if l.
startswith('>')])self.reset()def next(self):self.id = self.file_pointer.readline
()[1:].strip()self.q
>temp_31|source:-*- coding: utf-8 -*-|start:4425|stop:4925
len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(f
igsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', co
lor='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.
subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top 
= 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3
)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color
 = 'black', alpha = 
>temp_32|source:-*- coding: utf-8 -*-|start:2411|stop:2911
ount'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'
seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.u
nique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], revers
e = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(sel
f):if self.unique:return self.next_unique()else:return self.next_regular()def ne
xt_unique(self):if self.unique:if self.total_unique > 0 and self.pos < self.tota
l_unique:hash_entry 
>temp_33|source:-*- coding: utf-8 -*-|start:5878|stop:6378
,\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta
_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt
.show()except:passreturnclass QualSource:def __init__(self, quals_file_path, laz
y_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_i
nit = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = None
self.ids = []self.file_pointer = open(self.quals_file_path)self.file_pointer.see
k(0)if self.lazy_ini
>temp_34|source:-*- coding: utf-8 -*-|start:2345|stop:2845
sh_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1els
e:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq
,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dic
t[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self
.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.uni
que:return self.next_unique()else:return self.next_regular()def next_unique(self
):if self.unique:if 
>temp_35|source:-*- coding: utf-8 -*-|start:3443|stop:3943
(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = se
quence if self.allow_mixed_case else sequence.upper()self.pos += 1return Truedef
 get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.id == r
ead_id:return self.seqreturn Falsedef close(self):self.file_pointer.close()def r
eset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_poin
ter.seek(0)def visualize_sequence_length_distribution(self, title, dest = None, 
max_seq_len = None, 
>temp_36|source:-*- coding: utf-8 -*-|start:6238|stop:6738
neself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(se
lf.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals =
 Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines() if l.
startswith('>')])self.reset()def next(self):self.id = self.file_pointer.readline
()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line 
= self.file_pointer.readline()if not line:if len(qualscores):self.quals = qualsc
ores.strip()self.qua
>temp_37|source:-*- coding: utf-8 -*-|start:6202|stop:6702
= lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself
.ids = []self.file_pointer = open(self.quals_file_path)self.file_pointer.seek(0)
if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len([l for l in
 self.file_pointer.readlines() if l.startswith('>')])self.reset()def next(self):
self.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_in
t = Nonequalscores = ''while 1:line = self.file_pointer.readline()if not line:if
 len(qualscores):sel
>temp_38|source:-*- coding: utf-8 -*-|start:4577|stop:5077
rid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax
1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.
03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alp
ha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 
0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('se
quence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if yticks
tep == None:ytickste
>temp_39|source:-*- coding: utf-8 -*-|start:5583|stop:6083
])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.x
ticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max
: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_
lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 
0.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt.savefig
(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passreturnc
lass QualSource:def 
>temp_40|source:-*- coding: utf-8 -*-|start:5203|stop:5703
lt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0
, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, 
xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(se
q_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'bla
ck', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.updat
e({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.
02, 0.5, 'total: %s 
>temp_41|source:-*- coding: utf-8 -*-|start:3542|stop:4042
w_mixed_case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id
(self, read_id):self.reset()while self.next():if self.id == read_id:return self.
seqreturn Falsedef close(self):self.file_pointer.close()def reset(self):self.pos
 = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visu
alize_sequence_length_distribution(self, title, dest = None, max_seq_len = None,
 xtickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport matpl
otlib.gridspec as gr
>temp_42|source:-*- coding: utf-8 -*-|start:3394|stop:3894
lseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len
(line))breaksequence += line.strip()self.seq = sequence if self.allow_mixed_case
 else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_
id):self.reset()while self.next():if self.id == read_id:return self.seqreturn Fa
lsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id 
 = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequen
ce_length_distributi
>temp_43|source:-*- coding: utf-8 -*-|start:5948|stop:6448
self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.pn
g')try:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file
_path, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Nones
elf.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_
int = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_p
ointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = l
en([l for l in self.
>temp_44|source:-*- coding: utf-8 -*-|start:3197|stop:3697
 Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:line
 = self.file_pointer.readline()if not line:if len(sequence):self.seq = sequences
elf.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointe
r.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.se
q = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1return T
ruedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.i
d == read_id:return 
>temp_45|source:-*- coding: utf-8 -*-|start:3470|stop:3970
 len(line))breaksequence += line.strip()self.seq = sequence if self.allow_mixed_
case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, r
ead_id):self.reset()while self.next():if self.id == read_id:return self.seqretur
n Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self
.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_se
quence_length_distribution(self, title, dest = None, max_seq_len = None, xtickst
ep = None, ytickstep
>temp_46|source:-*- coding: utf-8 -*-|start:3202|stop:3702
self.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:line = se
lf.file_pointer.readline()if not line:if len(sequence):self.seq = sequenceself.p
os += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.see
k(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = s
equence if self.allow_mixed_case else sequence.upper()self.pos += 1return Truede
f get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.id == 
read_id:return self.
>temp_47|source:-*- coding: utf-8 -*-|start:1395|stop:1895
% (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequence
s.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.close()
class SequenceSource():def __init__(self, fasta_file_path, lazy_init = True, uni
que = False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pathsel
f.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_cases
elf.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.u
nique_hash_dict = {}
>temp_48|source:-*- coding: utf-8 -*-|start:297|stop:797
se read the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOut
put:def __init__(self, output_file_path):self.output_file_path = output_file_pat
hself.output_file_obj = open(output_file_path, 'w')def store(self, entry, split 
= True, store_frequencies = True):if entry.unique and store_frequencies:self.wri
te_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(e
ntry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_file_
obj.write('>%s\n' % 
>temp_49|source:-*- coding: utf-8 -*-|start:5219|stop:5719
0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_di
stribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_l
en)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distributi
on) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-
large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecol
or' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total:
 %s / mean: %.2f / s
>temp_50|source:-*- coding: utf-8 -*-|start:2224|stop:2724
hile self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if has
h in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)sel
f.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' :
 self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[
1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in sel
f.unique_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_di
ct)self.reset()def n
>temp_51|source:-*- coding: utf-8 -*-|start:4380|stop:4880
 or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:s
eq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.upda
te({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidt
h=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.s
ubplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_l
en_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_
len + 1), seq_len_di
>temp_52|source:-*- coding: utf-8 -*-|start:2167|stop:2667
nique:self.init_unique_hash()def init_unique_hash(self):while self.next_regular(
):hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_d
ict:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash
]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.i
d],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(se
lf.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], re
verse = True)]self.t
>temp_53|source:-*- coding: utf-8 -*-|start:4642|stop:5142
idSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.
05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color
 = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_dist
ribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences'
)plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50
) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xt
icks(range(xtickstep
>temp_54|source:-*- coding: utf-8 -*-|start:2941|stop:3441
nique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allo
w_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids =
 hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(
self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence 
= ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):self
.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):
self.file_pointer.se
>temp_55|source:-*- coding: utf-8 -*-|start:6441|stop:6941
n self.file_pointer.readlines() if l.startswith('>')])self.reset()def next(self)
:self.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_i
nt = Nonequalscores = ''while 1:line = self.file_pointer.readline()if not line:i
f len(qualscores):self.quals = qualscores.strip()self.quals_int = [int(q) for q 
in self.quals.split()]self.pos += 1return Trueelse:return Falseif line.startswit
h('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakqualscor
es += ' ' + line.str
>temp_56|source:-*- coding: utf-8 -*-|start:2016|stop:2516
)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in se
lf.file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:sel
f.init_unique_hash()def init_unique_hash(self):while self.next_regular():hash = 
hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.
unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'
] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq':
 self.seq,'count': 1
>temp_57|source:-*- coding: utf-8 -*-|start:4425|stop:4925
len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(f
igsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', co
lor='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.
subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top 
= 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3
)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color
 = 'black', alpha = 
>temp_58|source:-*- coding: utf-8 -*-|start:3982|stop:4482
ort matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengt
hs = []self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:s
ys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequen
ce_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq
_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) o
r 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq
_len_distribution[l]
>temp_59|source:-*- coding: utf-8 -*-|start:5872|stop:6372
gths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self
.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')t
ry:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file_pat
h, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.
lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int 
= Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_point
er.seek(0)if self.la
>temp_60|source:-*- coding: utf-8 -*-|start:4620|stop:5120
h=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.s
ubplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_l
en_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_
len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel
('number of sequences')plt.xlabel('sequence length')if xtickstep == None:xtickst
ep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distrib
ution) / 20 or 1plt.
>temp_61|source:-*- coding: utf-8 -*-|start:4922|stop:5422
 = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickst
ep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = m
ax(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, x
tickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribu
tion) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, yticks
tep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, y
max = max(seq_len_di
>temp_62|source:-*- coding: utf-8 -*-|start:1777|stop:2277
 allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.uni
que = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_next
_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)
if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in sel
f.file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self
.init_unique_hash()def init_unique_hash(self):while self.next_regular():hash = h
ashlib.sha1(self.seq
>temp_63|source:-*- coding: utf-8 -*-|start:1171|stop:1671
t__(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_
name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 
1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.po
s))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fa
sta.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSo
urce():def __init__(self, fasta_file_path, lazy_init = True, unique = False, all
ow_mixed_case = Fals
>temp_64|source:-*- coding: utf-8 -*-|start:2346|stop:2846
h_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else
:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,
'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict
[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.
total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.uniq
ue:return self.next_unique()else:return self.next_regular()def next_unique(self)
:if self.unique:if s
>temp_65|source:-*- coding: utf-8 -*-|start:2352|stop:2852
[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.
unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count
': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]
['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_
unique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:ret
urn self.next_unique()else:return self.next_regular()def next_unique(self):if se
lf.unique:if self.to
>temp_66|source:-*- coding: utf-8 -*-|start:1978|stop:2478
sta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneel
se:self.total_seq = len([l for l in self.file_pointer.readlines() if l.startswit
h('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(s
elf):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()
if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.
id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {
'id' : self.id,'ids'
>temp_67|source:-*- coding: utf-8 -*-|start:4272|stop:4772
derr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(ma
x(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1
)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize 
= (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.
50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot
(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95,
 right = 0.98)plt.pl
>temp_68|source:-*- coding: utf-8 -*-|start:4910|stop:5410
lack', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length
')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:y
tickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_se
q_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_
len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution)
 + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim
(ymin = 0, ymax = ma
>temp_69|source:-*- coding: utf-8 -*-|start:4297|stop:4797
x_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100
.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_length
s:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.u
pdate({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linew
idth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)pl
t.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(se
q_len_distribution, 
>temp_70|source:-*- coding: utf-8 -*-|start:2776|stop:3276
se:return self.next_regular()def next_unique(self):if self.unique:if self.total_
unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict[s
elf.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self
.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.
ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_reg
ular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequ
ence = ''while 1:lin
>temp_71|source:-*- coding: utf-8 -*-|start:361|stop:861
ibclass FastaOutput:def __init__(self, output_file_path):self.output_file_path =
 output_file_pathself.output_file_obj = open(output_file_path, 'w')def store(sel
f, entry, split = True, store_frequencies = True):if entry.unique and store_freq
uencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else
:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):s
elf.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):if
 split:seq = self.sp
>temp_72|source:-*- coding: utf-8 -*-|start:2404|stop:2904
ash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [sel
f.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([
(self.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict],
 reverse = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def n
ext(self):if self.unique:return self.next_unique()else:return self.next_regular(
)def next_unique(self):if self.unique:if self.total_unique > 0 and self.pos < se
lf.total_unique:hash
>temp_73|source:-*- coding: utf-8 -*-|start:6076|stop:6576
ce:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_path = 
quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = 
Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(
self.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals
 = Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines() if 
l.startswith('>')])self.reset()def next(self):self.id = self.file_pointer.readli
ne()[1:].strip()self
>temp_74|source:-*- coding: utf-8 -*-|start:4607|stop:5107
'-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.gr
id(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)p
lt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(rang
e(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.
15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep ==
 None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(se
q_len_distribution) 
>temp_75|source:-*- coding: utf-8 -*-|start:6092|stop:6592
self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_paths
elf.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals =
 Noneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_
path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self
.total_quals = len([l for l in self.file_pointer.readlines() if l.startswith('>'
)])self.reset()def next(self):self.id = self.file_pointer.readline()[1:].strip()
self.quals = Nonesel
>temp_76|source:-*- coding: utf-8 -*-|start:1434|stop:1934
lf.ids.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.writ
e('\n')def close(self):self.fasta.close()class SequenceSource():def __init__(sel
f, fasta_file_path, lazy_init = True, unique = False, allow_mixed_case = False):
self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_init
self.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = No
neself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_li
st = []self.unique_n
>temp_77|source:-*- coding: utf-8 -*-|start:4514|stop:5014
(16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50
', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(g
s[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, r
ight = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill
_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black
', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if
 xtickstep == None:x
>temp_78|source:-*- coding: utf-8 -*-|start:1845|stop:2345
= []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self
.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_poi
nter.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l 
for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()if self
.unique:self.init_unique_hash()def init_unique_hash(self):while self.next_regula
r():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash
_dict:self.unique_ha
>temp_79|source:-*- coding: utf-8 -*-|start:2352|stop:2852
[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.
unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count
': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]
['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_
unique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:ret
urn self.next_unique()else:return self.next_regular()def next_unique(self):if se
lf.unique:if self.to
>temp_80|source:-*- coding: utf-8 -*-|start:4728|stop:5228
ttom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'bl
ack', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distributi
on, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.x
label('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1
if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(r
ange(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yt
icks(range(0, max(se
>temp_81|source:-*- coding: utf-8 -*-|start:5674|stop:6174
t.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (le
n(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\m
in(sequence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size =
 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.p
df')except:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSo
urce:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_path 
= quals_file_pathsel
>temp_82|source:-*- coding: utf-8 -*-|start:1758|stop:2258
.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Nonese
lf.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list =
 []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.f
ile_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = 
len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()
if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next
_regular():hash = ha
>temp_83|source:-*- coding: utf-8 -*-|start:4840|stop:5340
en(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alp
ha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtick
step == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep =
 max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1,
 xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distri
bution) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytic
kstep)],size='xx-sma
>temp_84|source:-*- coding: utf-8 -*-|start:2734|stop:3234
if self.unique:return self.next_unique()else:return self.next_regular()def next_
unique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_u
nique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.po
s += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq
'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse
:return Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = se
lf.file_pointer.read
>temp_85|source:-*- coding: utf-8 -*-|start:3165|stop:3665
ef next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].
strip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if le
n(sequence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.
startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))brea
ksequence += line.strip()self.seq = sequence if self.allow_mixed_case else seque
nce.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.re
set()while self.next
>temp_86|source:-*- coding: utf-8 -*-|start:6745|stop:7245
= [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falsei
f line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(lin
e))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.qual
s_int = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(se
lf):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.qua
ls = Noneself.quals_int = Noneself.ids = []self.file_pointer.seek(0)if __name__ 
== '__main__':fasta 
>temp_87|source:-*- coding: utf-8 -*-|start:817|stop:1317
f, seq, split = True):if split:seq = self.split(seq)self.output_file_obj.write('
%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(se
quence), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:tick
s[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_ob
j.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences 
= []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos
 % 1000 == 0 or self
>temp_88|source:-*- coding: utf-8 -*-|start:6848|stop:7348
self.file_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores += '
 ' + line.strip()self.quals = qualscores.strip()self.quals_int = [int(q) for q i
n self.quals.split()]self.pos += 1return Truedef close(self):self.file_pointer.c
lose()def reset(self):self.pos = 0self.id  = Noneself.quals = Noneself.quals_int
 = Noneself.ids = []self.file_pointer.seek(0)if __name__ == '__main__':fasta = S
equenceSource(sys.argv[1])fasta.visualize_sequence_length_distribution(title = s
ys.argv[2] if len(sy
>temp_89|source:-*- coding: utf-8 -*-|start:4506|stop:5006
gsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', col
or='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.s
ubplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top =
 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)
plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color 
= 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence le
ngth')if xtickstep =
>temp_90|source:-*- coding: utf-8 -*-|start:6592|stop:7092
f.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()if n
ot line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [int(
q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif line.
startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))brea
kqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_int =
 [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(self):sel
f.file_pointer.close
>temp_91|source:-*- coding: utf-8 -*-|start:4512|stop:5012
= (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.
50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot
(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95,
 right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fi
ll_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'bla
ck', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length')
if xtickstep == None
>temp_92|source:-*- coding: utf-8 -*-|start:1331|stop:1831
1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.po
s))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fa
sta.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSo
urce():def __init__(self, fasta_file_path, lazy_init = True, unique = False, all
ow_mixed_case = False):self.fasta_file_path = fasta_file_pathself.name = Nonesel
f.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.
id  = Noneself.seq =
>temp_93|source:-*- coding: utf-8 -*-|start:3260|stop:3760
 = ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):sel
f.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>')
:self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += li
ne.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self
.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self
.next():if self.id == read_id:return self.seqreturn Falsedef close(self):self.fi
le_pointer.close()de
>temp_94|source:-*- coding: utf-8 -*-|start:5933|stop:6433
== None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.save
fig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init__(s
elf, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_pathse
lf.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = 
Noneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_p
ath)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.
total_quals = len([l
>temp_95|source:-*- coding: utf-8 -*-|start:1688|stop:2188
e_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mix
ed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = [
]self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.un
ique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointe
r.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for
 l in self.file_pointer.readlines() if l.startswith('>')])self.reset()if self.un
ique:self.init_uniqu
>temp_96|source:-*- coding: utf-8 -*-|start:4080|stop:4580
)while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\
r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(
len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len =
 max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distr
ibution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[
l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth'
 : 0.9})plt.rc('grid
>temp_97|source:-*- coding: utf-8 -*-|start:4545|stop:5045
axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1
)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplo
ts_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_di
stribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len +
 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('num
ber of sequences')plt.xlabel('sequence length')if xtickstep == None:xtickstep = 
(max_seq_len / 50) o
>temp_98|source:-*- coding: utf-8 -*-|start:5043|stop:5543
 or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xti
cks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')p
lt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0
, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, 
xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(se
q_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'bla
ck', size = 'xx-larg
>temp_99|source:-*- coding: utf-8 -*-|start:6727|stop:7227
p()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return True
else:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointe
r.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscore
s.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retur
n Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.
id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer.s
eek(0)if __name__ ==
>temp_100|source:-*- coding: utf-8 -*-|start:890|stop:1390
write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(0,
 len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[
x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_
file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.seq
uences = []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fa
sta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Readin
g FASTA into memory:
>temp_101|source:-*- coding: utf-8 -*-|start:4411|stop:4911
0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig 
= plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt
.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10
, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, botto
m = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black
', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution,
 y2 = 0, color = 'bl
>temp_102|source:-*- coding: utf-8 -*-|start:1450|stop:1950
lf.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close
(self):self.fasta.close()class SequenceSource():def __init__(self, fasta_file_pa
th, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta_file_
path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed
_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []s
elf.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.uniq
ue_next_hash = 0self
>temp_103|source:-*- coding: utf-8 -*-|start:128|stop:628
he terms of the GNU General Public License as published by the Free# Software Fo
undation; either version 2 of the License, or (at your option)# any later versio
n.## Please read the docs/COPYING file.import sysimport numpyimport hashlibclass
 FastaOutput:def __init__(self, output_file_path):self.output_file_path = output
_file_pathself.output_file_obj = open(output_file_path, 'w')def store(self, entr
y, split = True, store_frequencies = True):if entry.unique and store_frequencies
:self.write_id('%s|%
>temp_104|source:-*- coding: utf-8 -*-|start:2655|stop:3155
True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):i
f self.unique:return self.next_unique()else:return self.next_regular()def next_u
nique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_un
ique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos
 += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'
].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:
return Falseelse:ret
>temp_105|source:-*- coding: utf-8 -*-|start:3315|stop:3815
line:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse:return Fal
seif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(
line))breaksequence += line.strip()self.seq = sequence if self.allow_mixed_case 
else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_i
d):self.reset()while self.next():if self.id == read_id:return self.seqreturn Fal
sedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  
= Noneself.seq = Non
>temp_106|source:-*- coding: utf-8 -*-|start:2099|stop:2599
file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self.i
nit_unique_hash()def init_unique_hash(self):while self.next_regular():hash = has
hlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.uni
que_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] +
= 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': se
lf.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_ha
sh_dict[hash]['count
>temp_107|source:-*- coding: utf-8 -*-|start:5129|stop:5629
nge(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yti
cks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(
seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax =
 max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_
distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', s
ize = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'ax
es.edgecolor' : 20})
>temp_108|source:-*- coding: utf-8 -*-|start:5352|stop:5852
(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution
) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), we
ight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcP
arams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])
plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (
len(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),
\min(sequence_length
>temp_109|source:-*- coding: utf-8 -*-|start:4747|stop:5247
0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)p
lt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color =
 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence len
gth')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == Non
e:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max
_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(s
eq_len_distribution)
>temp_110|source:-*- coding: utf-8 -*-|start:2658|stop:3158
e)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if s
elf.unique:return self.next_unique()else:return self.next_regular()def next_uniq
ue(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_uniqu
e:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos +=
 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].u
pper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:ret
urn Falseelse:return
>temp_111|source:-*- coding: utf-8 -*-|start:6863|stop:7363
er.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip
()self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.sp
lit()]self.pos += 1return Truedef close(self):self.file_pointer.close()def reset
(self):self.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids
 = []self.file_pointer.seek(0)if __name__ == '__main__':fasta = SequenceSource(s
ys.argv[1])fasta.visualize_sequence_length_distribution(title = sys.argv[2] if l
en(sys.argv) == 3 el
>temp_112|source:-*- coding: utf-8 -*-|start:4508|stop:5008
ize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color
='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.sub
plot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0
.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)pl
t.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 
'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence leng
th')if xtickstep == 
>temp_113|source:-*- coding: utf-8 -*-|start:876|stop:1376
tput_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ti
cks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([s
equence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self
):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.id
s = []self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.nex
t():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fa
stalib] Reading FAST
>temp_114|source:-*- coding: utf-8 -*-|start:2772|stop:3272
()else:return self.next_regular()def next_unique(self):if self.unique:if self.to
tal_unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_di
ct[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if 
self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']s
elf.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next
_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()
sequence = ''while 1
>temp_115|source:-*- coding: utf-8 -*-|start:6590|stop:7090
elf.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()if
 not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [in
t(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif lin
e.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))br
eakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_int
 = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(self):s
elf.file_pointer.clo
>temp_116|source:-*- coding: utf-8 -*-|start:5066|stop:5566
ne:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, ma
x_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(
seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distribut
ion) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.
ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 2
0.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large',
 ha = 'center')ax1 =
>temp_117|source:-*- coding: utf-8 -*-|start:2656|stop:3156
rue)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if
 self.unique:return self.next_unique()else:return self.next_regular()def next_un
ique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_uni
que:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos 
+= 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq']
.upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:r
eturn Falseelse:retu
>temp_118|source:-*- coding: utf-8 -*-|start:2353|stop:2853
hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.u
nique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count'
: 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash][
'count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_u
nique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:retu
rn self.next_unique()else:return self.next_regular()def next_unique(self):if sel
f.unique:if self.tot
>temp_119|source:-*- coding: utf-8 -*-|start:2583|stop:3083
ict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]se
lf.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.u
nique:return self.next_unique()else:return self.next_regular()def next_unique(se
lf):if self.unique:if self.total_unique > 0 and self.pos < self.total_unique:has
h_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1sel
f.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper(
)self.id  = hash_ent
>temp_120|source:-*- coding: utf-8 -*-|start:2897|stop:3397
ue:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos +
= 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].
upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:re
turn Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self.
file_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer
.readline()if not line:if len(sequence):self.seq = sequenceself.pos += 1return T
rueelse:return False
>temp_121|source:-*- coding: utf-8 -*-|start:5951|stop:6451
f.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')
try:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file_pa
th, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Noneself
.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int
 = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_poin
ter.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len(
[l for l in self.fil
>temp_122|source:-*- coding: utf-8 -*-|start:461|stop:961
f.output_file_obj = open(output_file_path, 'w')def store(self, entry, split = Tr
ue, store_frequencies = True):if entry.unique and store_frequencies:self.write_i
d('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry
.id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.
write('>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = self.sp
lit(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_
length = 80):ticks =
>temp_123|source:-*- coding: utf-8 -*-|start:2373|stop:2873
self.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash
] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash
_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for 
hash in self.unique_hash_dict], reverse = True)]self.total_unique = len(self.uni
que_hash_dict)self.reset()def next(self):if self.unique:return self.next_unique(
)else:return self.next_regular()def next_unique(self):if self.unique:if self.tot
al_unique > 0 and se
>temp_124|source:-*- coding: utf-8 -*-|start:528|stop:1028
y, split = True, store_frequencies = True):if entry.unique and store_frequencies
:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.w
rite_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.out
put_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):if split:
seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, seq
uence, piece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(s
equence)]return '\n'
>temp_125|source:-*- coding: utf-8 -*-|start:3101|stop:3601
 hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(
self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence 
= ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):self
.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):
self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += lin
e.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.
pos += 1return Trued
>temp_126|source:-*- coding: utf-8 -*-|start:4870|stop:5370
seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number o
f sequences')plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_
seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 2
0 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size
='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for
 y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xl
im(xmin = 0, xmax = 
>temp_127|source:-*- coding: utf-8 -*-|start:2284|stop:2784
)).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'
].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_
dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.un
ique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], h
ash)\for hash in self.unique_hash_dict], reverse = True)]self.total_unique = len
(self.unique_hash_dict)self.reset()def next(self):if self.unique:return self.nex
t_unique()else:retur
>temp_128|source:-*- coding: utf-8 -*-|start:1790|stop:2290
caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniques
elf.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self
.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_
init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointe
r.readlines() if l.startswith('>')])self.reset()if self.unique:self.init_unique_
hash()def init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(s
elf.seq.upper()).hex
>temp_129|source:-*- coding: utf-8 -*-|start:4215|stop:4715
)sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not 
max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 1
00.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_leng
ths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams
.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', lin
ewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)
plt.subplots_adjust(
>temp_130|source:-*- coding: utf-8 -*-|start:2676|stop:3176
ue = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:return 
self.next_unique()else:return self.next_regular()def next_unique(self):if self.u
nique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = sel
f.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_
entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = h
ash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:retu
rn Falsedef next_reg
>temp_131|source:-*- coding: utf-8 -*-|start:4573|stop:5073
c('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 
1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom 
= 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black',
 alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y
2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel
('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if yt
ickstep == None:ytic
>temp_132|source:-*- coding: utf-8 -*-|start:213|stop:713
ion; either version 2 of the License, or (at your option)# any later version.## 
Please read the docs/COPYING file.import sysimport numpyimport hashlibclass Fast
aOutput:def __init__(self, output_file_path):self.output_file_path = output_file
_pathself.output_file_obj = open(output_file_path, 'w')def store(self, entry, sp
lit = True, store_frequencies = True):if entry.unique and store_frequencies:self
.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_
id(entry.id)self.wri
>temp_133|source:-*- coding: utf-8 -*-|start:3609|stop:4109
eq_by_read_id(self, read_id):self.reset()while self.next():if self.id == read_id
:return self.seqreturn Falsedef close(self):self.file_pointer.close()def reset(s
elf):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.se
ek(0)def visualize_sequence_length_distribution(self, title, dest = None, max_se
q_len = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pl
timport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while se
lf.next():if self.po
>temp_134|source:-*- coding: utf-8 -*-|start:2984|stop:3484
.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()
self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Fa
lseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self.file_po
inter.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.readli
ne()if not line:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse
:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.te
ll() - len(line))bre
>temp_135|source:-*- coding: utf-8 -*-|start:6473|stop:6973
if l.startswith('>')])self.reset()def next(self):self.id = self.file_pointer.rea
dline()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:
line = self.file_pointer.readline()if not line:if len(qualscores):self.quals = q
ualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos +
= 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(se
lf.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.qua
ls = qualscores.stri
>temp_136|source:-*- coding: utf-8 -*-|start:2495|stop:2995
: self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.uniqu
e_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = 
True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):i
f self.unique:return self.next_unique()else:return self.next_regular()def next_u
nique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_un
ique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos
 += 1self.seq = hash
>temp_137|source:-*- coding: utf-8 -*-|start:950|stop:1450
80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.j
oin([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def clos
e(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):s
elf.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while self.fas
ta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write(
'\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush
()self.ids.append(se
>temp_138|source:-*- coding: utf-8 -*-|start:6591|stop:7091
lf.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()if 
not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [int
(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif line
.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))bre
akqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_int 
= [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(self):se
lf.file_pointer.clos
>temp_139|source:-*- coding: utf-8 -*-|start:6745|stop:7245
= [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falsei
f line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(lin
e))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.qual
s_int = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(se
lf):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.qua
ls = Noneself.quals_int = Noneself.ids = []self.file_pointer.seek(0)if __name__ 
== '__main__':fasta 
>temp_140|source:-*- coding: utf-8 -*-|start:923|stop:1423
, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) + [
len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0
, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta:d
ef __init__(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceS
ource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta
.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.
fasta.pos))sys.stder
>temp_141|source:-*- coding: utf-8 -*-|start:2534|stop:3034
list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for h
ash in self.unique_hash_dict], reverse = True)]self.total_unique = len(self.uniq
ue_hash_dict)self.reset()def next(self):if self.unique:return self.next_unique()
else:return self.next_regular()def next_unique(self):if self.unique:if self.tota
l_unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict
[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if se
lf.allow_mixed_case 
>temp_142|source:-*- coding: utf-8 -*-|start:4143|stop:4643
s.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequenc
e_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_
len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or
 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_
len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update(
{'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0
.1)gs = gridspec.Gri
>temp_143|source:-*- coding: utf-8 -*-|start:2016|stop:2516
)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in se
lf.file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:sel
f.init_unique_hash()def init_unique_hash(self):while self.next_regular():hash = 
hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.
unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'
] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq':
 self.seq,'count': 1
>temp_144|source:-*- coding: utf-8 -*-|start:843|stop:1343
plit:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self
, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) + [
len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0
, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta:d
ef __init__(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceS
ource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta
.pos == 1:sys.stderr
>temp_145|source:-*- coding: utf-8 -*-|start:859|stop:1359
split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piec
e_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]re
turn '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1
)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(self
, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)whi
le self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.st
derr.write('\r[fasta
>temp_146|source:-*- coding: utf-8 -*-|start:5782|stop:6282
an(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(se
quence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:d
est = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest 
+ '.png')try:plt.show()except:passreturnclass QualSource:def __init__(self, qual
s_file_path, lazy_init = True):self.quals_file_path = quals_file_pathself.name =
 Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.
quals_int = Noneself
>temp_147|source:-*- coding: utf-8 -*-|start:905|stop:1405
seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence),
 piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]
] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close(
)class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self
.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 
== 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into me
mory: %s' % (self.fa
>temp_148|source:-*- coding: utf-8 -*-|start:5087|stop:5587
q_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xticks
tep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution)
 + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)]
,size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax =
 max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5,
 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = 
plt.subplot(gs[9])pl
>temp_149|source:-*- coding: utf-8 -*-|start:6455|stop:6955
inter.readlines() if l.startswith('>')])self.reset()def next(self):self.id = sel
f.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequals
cores = ''while 1:line = self.file_pointer.readline()if not line:if len(qualscor
es):self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.
split()]self.pos += 1return Trueelse:return Falseif line.startswith('>'):self.fi
le_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + li
ne.strip()self.quals
>temp_150|source:-*- coding: utf-8 -*-|start:3736|stop:4236
f.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = None
self.ids = []self.file_pointer.seek(0)def visualize_sequence_length_distribution
(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickstep = Non
e):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence
_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or self.pos 
== 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()
sequence_lengths.app
>temp_151|source:-*- coding: utf-8 -*-|start:1320|stop:1820
sta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (se
lf.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.app
end(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class
 SequenceSource():def __init__(self, fasta_file_path, lazy_init = True, unique =
 False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pathself.nam
e = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.p
os = 0self.id  = Non
>temp_152|source:-*- coding: utf-8 -*-|start:3090|stop:3590
]self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef ne
xt_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip
()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(seq
uence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.start
swith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequ
ence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence.u
pper()self.pos += 1r
>temp_153|source:-*- coding: utf-8 -*-|start:1412|stop:1912
))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fas
ta.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSou
rce():def __init__(self, fasta_file_path, lazy_init = True, unique = False, allo
w_mixed_case = False):self.fasta_file_path = fasta_file_pathself.name = Noneself
.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.i
d  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict =
 {}self.unique_hash_
>temp_154|source:-*- coding: utf-8 -*-|start:4384|stop:4884
10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_l
en_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({
'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.
1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subpl
ots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_d
istribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len 
+ 1), seq_len_distri
>temp_155|source:-*- coding: utf-8 -*-|start:1657|stop:2157
ed_case = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy
_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = 
Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}se
lf.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.f
asta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Nonee
lse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.startswi
th('>')])self.reset(
>temp_156|source:-*- coding: utf-8 -*-|start:2023|stop:2523
f.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file
_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self.init_
unique_hash()def init_unique_hash(self):while self.next_regular():hash = hashlib
.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_
hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1e
lse:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.s
eq,'count': 1}self.u
>temp_157|source:-*- coding: utf-8 -*-|start:3435|stop:3935
ter.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.
seq = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1return
 Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if self
.id == read_id:return self.seqreturn Falsedef close(self):self.file_pointer.clos
e()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.f
ile_pointer.seek(0)def visualize_sequence_length_distribution(self, title, dest 
= None, max_seq_len 
>temp_158|source:-*- coding: utf-8 -*-|start:3141|stop:3641
 Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self.file
_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.rea
dline()if not line:if len(sequence):self.seq = sequenceself.pos += 1return Truee
lse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer
.tell() - len(line))breaksequence += line.strip()self.seq = sequence if self.all
ow_mixed_case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_i
d(self, read_id):sel
>temp_159|source:-*- coding: utf-8 -*-|start:5641|stop:6141
se)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std
: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths
), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\v
a = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_fi
le_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.sh
ow()except:passreturnclass QualSource:def __init__(self, quals_file_path, lazy_i
nit = True):self.qua
>temp_160|source:-*- coding: utf-8 -*-|start:3629|stop:4129
read_id):self.reset()while self.next():if self.id == read_id:return self.seqretu
rn Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0sel
f.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_s
equence_length_distribution(self, title, dest = None, max_seq_len = None, xticks
tep = None, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.g
ridspec as gridspecsequence_lengths = []self.reset()while self.next():if self.po
s % 10000 == 0 or se
>temp_161|source:-*- coding: utf-8 -*-|start:5016|stop:5516
ckstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_dis
tribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rot
ation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, yti
ckstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx
-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_
len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%
s' % (title), weight
>temp_162|source:-*- coding: utf-8 -*-|start:718|stop:1218
q(entry.seq, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % i
d)def write_seq(self, seq, split = True):if split:seq = self.split(seq)self.outp
ut_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):tick
s = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([seq
uence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):
self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids 
= []self.sequences =
>temp_163|source:-*- coding: utf-8 -*-|start:3408|stop:3908
rtswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakse
quence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence
.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset
()while self.next():if self.id == read_id:return self.seqreturn Falsedef close(s
elf):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.se
q = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_dist
ribution(self, title
>temp_164|source:-*- coding: utf-8 -*-|start:1470|stop:1970
uences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.cl
ose()class SequenceSource():def __init__(self, fasta_file_path, lazy_init = True
, unique = False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pa
thself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_
caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniques
elf.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self
.file_pointer = open
>temp_165|source:-*- coding: utf-8 -*-|start:4609|stop:5109
', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid
(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt
.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(
0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15
)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep == N
one:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_
len_distribution) / 
>temp_166|source:-*- coding: utf-8 -*-|start:6141|stop:6641
ls_file_path = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos
 = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_
pointer = open(self.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:s
elf.total_quals = Noneelse:self.total_quals = len([l for l in self.file_pointer.
readlines() if l.startswith('>')])self.reset()def next(self):self.id = self.file
_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores 
= ''while 1:line = s
>temp_167|source:-*- coding: utf-8 -*-|start:6732|stop:7232
lf.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:
return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tel
l() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.str
ip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Tru
edef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  =
 Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer.seek(0
)if __name__ == '__m
>temp_168|source:-*- coding: utf-8 -*-|start:3841|stop:4341
inter.seek(0)def visualize_sequence_length_distribution(self, title, dest = None
, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotlib.pypl
ot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()
while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r
[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(l
en(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = 
max(sequence_lengths
>temp_169|source:-*- coding: utf-8 -*-|start:187|stop:687
the Free# Software Foundation; either version 2 of the License, or (at your opti
on)# any later version.## Please read the docs/COPYING file.import sysimport num
pyimport hashlibclass FastaOutput:def __init__(self, output_file_path):self.outp
ut_file_path = output_file_pathself.output_file_obj = open(output_file_path, 'w'
)def store(self, entry, split = True, store_frequencies = True):if entry.unique 
and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(en
try.ids)))else:self.
>temp_170|source:-*- coding: utf-8 -*-|start:1779|stop:2279
llow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.uniqu
e = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_h
ash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if
 self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.
file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self.i
nit_unique_hash()def init_unique_hash(self):while self.next_regular():hash = has
hlib.sha1(self.seq.u
>temp_171|source:-*- coding: utf-8 -*-|start:2390|stop:2890
ue_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.
id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for
 i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in self.uniq
ue_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_dict)sel
f.reset()def next(self):if self.unique:return self.next_unique()else:return self
.next_regular()def next_unique(self):if self.unique:if self.total_unique > 0 and
 self.pos < self.tot
>temp_172|source:-*- coding: utf-8 -*-|start:2928|stop:3428
h_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq']
 if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['i
d']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef 
next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].str
ip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(s
equence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.sta
rtswith('>'):self.fi
>temp_173|source:-*- coding: utf-8 -*-|start:3727|stop:4227
self):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.s
eq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_dis
tribution(self, title, dest = None, max_seq_len = None, xtickstep = None, yticks
tep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspe
csequence_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or 
self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stder
r.flush()sequence_le
>temp_174|source:-*- coding: utf-8 -*-|start:16|stop:516
right (C) 2014, A. Murat Eren## This program is free software; you can redistrib
ute it and/or modify it under# the terms of the GNU General Public License as pu
blished by the Free# Software Foundation; either version 2 of the License, or (a
t your option)# any later version.## Please read the docs/COPYING file.import sy
simport numpyimport hashlibclass FastaOutput:def __init__(self, output_file_path
):self.output_file_path = output_file_pathself.output_file_obj = open(output_fil
e_path, 'w')def stor
>temp_175|source:-*- coding: utf-8 -*-|start:4478|stop:4978
n[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidt
h' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspe
c.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(lef
t=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, c
olor = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_
distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequen
ces')plt.xlabel('seq
>temp_176|source:-*- coding: utf-8 -*-|start:2816|stop:3316
ique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_uni
que:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos 
+= 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq']
.upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:r
eturn Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self
.file_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_pointe
r.readline()if not l
>temp_177|source:-*- coding: utf-8 -*-|start:3369|stop:3869
return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.
file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = sequence
 if self.allow_mixed_case else sequence.upper()self.pos += 1return Truedef get_s
eq_by_read_id(self, read_id):self.reset()while self.next():if self.id == read_id
:return self.seqreturn Falsedef close(self):self.file_pointer.close()def reset(s
elf):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.se
ek(0)def visualize_s
>temp_178|source:-*- coding: utf-8 -*-|start:779|stop:1279
j.write('>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = self.
split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piec
e_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]re
turn '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1
)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(self
, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)whi
le self.fasta.next()
>temp_179|source:-*- coding: utf-8 -*-|start:6596|stop:7096
als_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()if not l
ine:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [int(q) f
or q in self.quals.split()]self.pos += 1return Trueelse:return Falseif line.star
tswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakqua
lscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_int = [in
t(q) for q in self.quals.split()]self.pos += 1return Truedef close(self):self.fi
le_pointer.close()de
>temp_180|source:-*- coding: utf-8 -*-|start:5393|stop:5893
in = 0, ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))pl
t.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = '
center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.
grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.
2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence
_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_leng
ths)),\va = 'center'
>temp_181|source:-*- coding: utf-8 -*-|start:39|stop:539
t Eren## This program is free software; you can redistribute it and/or modify it
 under# the terms of the GNU General Public License as published by the Free# So
ftware Foundation; either version 2 of the License, or (at your option)# any lat
er version.## Please read the docs/COPYING file.import sysimport numpyimport has
hlibclass FastaOutput:def __init__(self, output_file_path):self.output_file_path
 = output_file_pathself.output_file_obj = open(output_file_path, 'w')def store(s
elf, entry, split = 
>temp_182|source:-*- coding: utf-8 -*-|start:6790|stop:7290
s += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek
(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.
quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]s
elf.pos += 1return Truedef close(self):self.file_pointer.close()def reset(self):
self.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []se
lf.file_pointer.seek(0)if __name__ == '__main__':fasta = SequenceSource(sys.argv
[1])fasta.visualize_
>temp_183|source:-*- coding: utf-8 -*-|start:6724|stop:7224
trip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return T
rueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_poi
nter.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualsc
ores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1re
turn Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0se
lf.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointe
r.seek(0)if __name__
>temp_184|source:-*- coding: utf-8 -*-|start:1670|stop:2170
se):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_
initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq 
= Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_has
h_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_pat
h)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.tota
l_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self
.reset()if self.uniq
>temp_185|source:-*- coding: utf-8 -*-|start:6722|stop:7222
.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return
 Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_p
ointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qual
scores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1
return Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0
self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_poin
ter.seek(0)if __name
>temp_186|source:-*- coding: utf-8 -*-|start:2161|stop:2661
self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next_re
gular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_
hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dic
t[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [
self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorte
d([(self.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dic
t], reverse = True)]
>temp_187|source:-*- coding: utf-8 -*-|start:5087|stop:5587
q_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xticks
tep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution)
 + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)]
,size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax =
 max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5,
 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = 
plt.subplot(gs[9])pl
>temp_188|source:-*- coding: utf-8 -*-|start:1729|stop:2229
elf.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0sel
f.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dic
t = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = ope
n(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq
 = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.
startswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_uniq
ue_hash(self):while 
>temp_189|source:-*- coding: utf-8 -*-|start:1353|stop:1853
[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()s
elf.ids.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.wri
te('\n')def close(self):self.fasta.close()class SequenceSource():def __init__(se
lf, fasta_file_path, lazy_init = True, unique = False, allow_mixed_case = False)
:self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_ini
tself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = N
oneself.ids = []self
>temp_190|source:-*- coding: utf-8 -*-|start:794|stop:1294
 % id)def write_seq(self, seq, split = True):if split:seq = self.split(seq)self.
output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):
ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join(
[sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(se
lf):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.
ids = []self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.n
ext():if self.fasta.
>temp_191|source:-*- coding: utf-8 -*-|start:3751|stop:4251
close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []se
lf.file_pointer.seek(0)def visualize_sequence_length_distribution(self, title, d
est = None, max_seq_len = None, xtickstep = None, ytickstep = None):import matpl
otlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []se
lf.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr
.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_length
s.append(len(self.se
>temp_192|source:-*- coding: utf-8 -*-|start:2582|stop:3082
dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]s
elf.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.
unique:return self.next_unique()else:return self.next_regular()def next_unique(s
elf):if self.unique:if self.total_unique > 0 and self.pos < self.total_unique:ha
sh_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1se
lf.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper
()self.id  = hash_en
>temp_193|source:-*- coding: utf-8 -*-|start:5206|stop:5706
yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, m
ax(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xma
x = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_l
en_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black'
, size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({
'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02,
 0.5, 'total: %s / m
>temp_194|source:-*- coding: utf-8 -*-|start:1601|stop:2101
a_file_path, lazy_init = True, unique = False, allow_mixed_case = False):self.fa
sta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.al
low_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.
ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []
self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file
_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len
([l for l in self.fi
>temp_195|source:-*- coding: utf-8 -*-|start:1096|stop:1596
- 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(s
elf, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)
while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys
.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sy
s.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fasta.s
eq)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSource(
):def __init__(self,
>temp_196|source:-*- coding: utf-8 -*-|start:5019|stop:5519
tep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distri
bution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotati
on=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, yticks
tep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-sm
all')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len
_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' 
% (title), weight = 
>temp_197|source:-*- coding: utf-8 -*-|start:409|stop:909
le_path):self.output_file_path = output_file_pathself.output_file_obj = open(out
put_file_path, 'w')def store(self, entry, split = True, store_frequencies = True
):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'freq
uency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq
, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write
_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_obj
.write('%s\n' % seq)
>temp_198|source:-*- coding: utf-8 -*-|start:4623|stop:5123
.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subp
lots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_
distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len
 + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('n
umber of sequences')plt.xlabel('sequence length')if xtickstep == None:xtickstep 
= (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distributi
on) / 20 or 1plt.xti
>temp_199|source:-*- coding: utf-8 -*-|start:6112|stop:6612
h, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.
lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int 
= Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_point
er.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len([
l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()def n
ext(self):self.id = self.file_pointer.readline()[1:].strip()self.quals = Nonesel
f.quals_int = Nonequ
>temp_200|source:-*- coding: utf-8 -*-|start:6807|stop:7307
else:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointe
r.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscore
s.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retur
n Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.
id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer.s
eek(0)if __name__ == '__main__':fasta = SequenceSource(sys.argv[1])fasta.visuali
ze_sequence_length_d
>temp_201|source:-*- coding: utf-8 -*-|start:2037|stop:2537
lf.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readl
ines() if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()d
ef init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq
.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash
]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.uniqu
e_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}
self.unique_hash_lis
>temp_202|source:-*- coding: utf-8 -*-|start:1612|stop:2112
, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta_file_pa
th = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_c
ase = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []sel
f.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique
_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.se
ek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l i
n self.file_pointer.
>temp_203|source:-*- coding: utf-8 -*-|start:2878|stop:3378
s < self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[s
elf.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else
 hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids'
]return Trueelse:return Falseelse:return Falsedef next_regular(self):self.seq = 
Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:line 
= self.file_pointer.readline()if not line:if len(sequence):self.seq = sequencese
lf.pos += 1return Tr
>temp_204|source:-*- coding: utf-8 -*-|start:799|stop:1299
)def write_seq(self, seq, split = True):if split:seq = self.split(seq)self.outpu
t_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks
 = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequ
ence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):s
elf.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids =
 []self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next()
:if self.fasta.pos %
>temp_205|source:-*- coding: utf-8 -*-|start:3093|stop:3593
lf.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_
regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()s
equence = ''while 1:line = self.file_pointer.readline()if not line:if len(sequen
ce):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswi
th('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequenc
e += line.strip()self.seq = sequence if self.allow_mixed_case else sequence.uppe
r()self.pos += 1retu
>temp_206|source:-*- coding: utf-8 -*-|start:5216|stop:5716
ge(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len
_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_se
q_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distrib
ution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = '
xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edge
color' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'tot
al: %s / mean: %.2f 
>temp_207|source:-*- coding: utf-8 -*-|start:1257|stop:1757
hile self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.
stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sys
.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fasta.se
q)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSource()
:def __init__(self, fasta_file_path, lazy_init = True, unique = False, allow_mix
ed_case = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy
_init = lazy_initsel
>temp_208|source:-*- coding: utf-8 -*-|start:3369|stop:3869
return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.
file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = sequence
 if self.allow_mixed_case else sequence.upper()self.pos += 1return Truedef get_s
eq_by_read_id(self, read_id):self.reset()while self.next():if self.id == read_id
:return self.seqreturn Falsedef close(self):self.file_pointer.close()def reset(s
elf):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.se
ek(0)def visualize_s
>temp_209|source:-*- coding: utf-8 -*-|start:1756|stop:2256
lf.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = None
self.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list
 = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self
.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq 
= len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset
()if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.ne
xt_regular():hash = 
>temp_210|source:-*- coding: utf-8 -*-|start:4662|stop:5162
lt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, t
op = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 
0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, co
lor = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequenc
e length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep =
= None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep
, max_seq_len + 1, x
>temp_211|source:-*- coding: utf-8 -*-|start:4077|stop:4577
et()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write
('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.appe
nd(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_le
n = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_di
stribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distributi
on[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewid
th' : 0.9})plt.rc('g
>temp_212|source:-*- coding: utf-8 -*-|start:2046|stop:2546
seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() if
 l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_u
nique_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper())
.hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].
append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_di
ct[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.uniq
ue_hash_list = [i[1]
>temp_213|source:-*- coding: utf-8 -*-|start:5294|stop:5794
en_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_
seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distr
ibution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size =
 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.ed
gecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 't
otal: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),
numpy.mean(sequence_
>temp_214|source:-*- coding: utf-8 -*-|start:5331|stop:5831
e='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max
(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.9
6, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.
subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yti
cks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / m
in: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.s
td(sequence_lengths)
>temp_215|source:-*- coding: utf-8 -*-|start:6667|stop:7167
)if not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = 
[int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif 
line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line)
)breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_
int = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(self
):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.quals
 = Noneself.quals_in
>temp_216|source:-*- coding: utf-8 -*-|start:148|stop:648
General Public License as published by the Free# Software Foundation; either ver
sion 2 of the License, or (at your option)# any later version.## Please read the
 docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:def __i
nit__(self, output_file_path):self.output_file_path = output_file_pathself.outpu
t_file_obj = open(output_file_path, 'w')def store(self, entry, split = True, sto
re_frequencies = True):if entry.unique and store_frequencies:self.write_id('%s|%
s' % (entry.id, 'fre
>temp_217|source:-*- coding: utf-8 -*-|start:680|stop:1180
e:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):
self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):i
f split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(s
elf, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) 
+ [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in rang
e(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFast
a:def __init__(self,
>temp_218|source:-*- coding: utf-8 -*-|start:6010|stop:6510
avefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init_
_(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_pat
hself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals
 = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_fil
e_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:se
lf.total_quals = len([l for l in self.file_pointer.readlines() if l.startswith('
>')])self.reset()def
>temp_219|source:-*- coding: utf-8 -*-|start:4909|stop:5409
black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence lengt
h')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:
ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_s
eq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq
_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution
) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.yli
m(ymin = 0, ymax = m
>temp_220|source:-*- coding: utf-8 -*-|start:5991|stop:6491
'.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass Qua
lSource:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_pa
th = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.i
d  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer = 
open(self.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_
quals = Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines(
) if l.startswith('>
>temp_221|source:-*- coding: utf-8 -*-|start:1809|stop:2309
f.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dic
t = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = ope
n(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq
 = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.
startswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_uniq
ue_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).he
xdigest()if hash in 
>temp_222|source:-*- coding: utf-8 -*-|start:2552|stop:3052
 in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in self.unique
_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_dict)self.
reset()def next(self):if self.unique:return self.next_unique()else:return self.n
ext_regular()def next_unique(self):if self.unique:if self.total_unique > 0 and s
elf.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_
list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_cas
e else hash_entry['s
>temp_223|source:-*- coding: utf-8 -*-|start:2815|stop:3315
nique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_un
ique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos
 += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'
].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:
return Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = sel
f.file_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_point
er.readline()if not 
>temp_224|source:-*- coding: utf-8 -*-|start:6009|stop:6509
savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init
__(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_pa
thself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.qual
s = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_fi
le_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:s
elf.total_quals = len([l for l in self.file_pointer.readlines() if l.startswith(
'>')])self.reset()de
>temp_225|source:-*- coding: utf-8 -*-|start:4498|stop:4998
igure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('gr
id', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1
 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.0
3, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alph
a = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0
, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('seq
uence length')if xti
>temp_226|source:-*- coding: utf-8 -*-|start:1630|stop:2130
, unique = False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pa
thself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_
caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniques
elf.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self
.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_
init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointe
r.readlines() if l.s
>temp_227|source:-*- coding: utf-8 -*-|start:3761|stop:4261
 reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_po
inter.seek(0)def visualize_sequence_length_distribution(self, title, dest = None
, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotlib.pypl
ot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()
while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r
[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(l
en(self.seq))self.re
>temp_228|source:-*- coding: utf-8 -*-|start:4608|stop:5108
-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.gri
d(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)pl
t.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range
(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.1
5)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep == 
None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq
_len_distribution) /
>temp_229|source:-*- coding: utf-8 -*-|start:2347|stop:2847
_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:
self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'
count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[
hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.t
otal_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.uniqu
e:return self.next_unique()else:return self.next_regular()def next_unique(self):
if self.unique:if se
>temp_230|source:-*- coding: utf-8 -*-|start:155|stop:655
 Public License as published by the Free# Software Foundation; either version 2 
of the License, or (at your option)# any later version.## Please read the docs/C
OPYING file.import sysimport numpyimport hashlibclass FastaOutput:def __init__(s
elf, output_file_path):self.output_file_path = output_file_pathself.output_file_
obj = open(output_file_path, 'w')def store(self, entry, split = True, store_freq
uencies = True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (e
ntry.id, 'frequency:
>temp_231|source:-*- coding: utf-8 -*-|start:708|stop:1208
f.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write('
>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = self.split(seq
)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length 
= 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'
.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def cl
ose(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name)
:self.ids = []self.s
>temp_232|source:-*- coding: utf-8 -*-|start:2621|stop:3121
self.unique_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash
_dict)self.reset()def next(self):if self.unique:return self.next_unique()else:re
turn self.next_regular()def next_unique(self):if self.unique:if self.total_uniqu
e > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.u
nique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allo
w_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids =
 hash_entry['ids']re
>temp_233|source:-*- coding: utf-8 -*-|start:2980|stop:3480
self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upp
er()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:retur
n Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self.fil
e_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.re
adline()if not line:if len(sequence):self.seq = sequenceself.pos += 1return True
else:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointe
r.tell() - len(line)
>temp_234|source:-*- coding: utf-8 -*-|start:3572|stop:4072
er()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()wh
ile self.next():if self.id == read_id:return self.seqreturn Falsedef close(self)
:self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = 
Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_distribu
tion(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickstep =
 None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequ
ence_lengths = []sel
>temp_235|source:-*- coding: utf-8 -*-|start:995|stop:1495
gth) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in
 range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class Rea
dFasta:def __init__(self, f_name):self.ids = []self.sequences = []self.fasta = S
equenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or se
lf.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' 
% (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequence
s.append(self.fasta.
>temp_236|source:-*- coding: utf-8 -*-|start:3979|stop:4479
import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_le
ngths = []self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 
1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()seq
uence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not max_
seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0
) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:
seq_len_distribution
>temp_237|source:-*- coding: utf-8 -*-|start:6365|stop:6865
self.lazy_init:self.total_quals = Noneelse:self.total_quals = len([l for l in se
lf.file_pointer.readlines() if l.startswith('>')])self.reset()def next(self):sel
f.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int =
 Nonequalscores = ''while 1:line = self.file_pointer.readline()if not line:if le
n(qualscores):self.quals = qualscores.strip()self.quals_int = [int(q) for q in s
elf.quals.split()]self.pos += 1return Trueelse:return Falseif line.startswith('>
'):self.file_pointer
>temp_238|source:-*- coding: utf-8 -*-|start:3341|stop:3841
.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):
self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += lin
e.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.
pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.
next():if self.id == read_id:return self.seqreturn Falsedef close(self):self.fil
e_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.
ids = []self.file_po
>temp_239|source:-*- coding: utf-8 -*-|start:5253|stop:5753
ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size=
'xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(s
eq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96,
 '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.su
bplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.ytick
s([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min
: %s / max: %s'\% (l
>temp_240|source:-*- coding: utf-8 -*-|start:5054|stop:5554
ckstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(x
tickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(r
ange(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_l
en_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_
seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distr
ibution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size =
 'xx-large', ha = 'c
>temp_241|source:-*- coding: utf-8 -*-|start:6263|stop:6763
uals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.f
ile_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_qual
s = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.res
et()def next(self):self.id = self.file_pointer.readline()[1:].strip()self.quals 
= Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readl
ine()if not line:if len(qualscores):self.quals = qualscores.strip()self.quals_in
t = [int(q) for q in
>temp_242|source:-*- coding: utf-8 -*-|start:3848|stop:4348
eek(0)def visualize_sequence_length_distribution(self, title, dest = None, max_s
eq_len = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as p
ltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while s
elf.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastal
ib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self
.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(seq
uence_lengths) + (in
>temp_243|source:-*- coding: utf-8 -*-|start:5712|stop:6212
.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequenc
e_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_len
gths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self
.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')t
ry:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file_pat
h, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.
lazy_init = lazy_ini
>temp_244|source:-*- coding: utf-8 -*-|start:4661|stop:5161
plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, 
top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha =
 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, c
olor = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequen
ce length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep 
== None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickste
p, max_seq_len + 1, 
>temp_245|source:-*- coding: utf-8 -*-|start:3109|stop:3609
try['ids']return Trueelse:return Falseelse:return Falsedef next_regular(self):se
lf.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''whil
e 1:line = self.file_pointer.readline()if not line:if len(sequence):self.seq = s
equenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.fil
e_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip(
)self.seq = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1
return Truedef get_s
>temp_246|source:-*- coding: utf-8 -*-|start:1908|stop:2408
ash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_p
ath)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.to
tal_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>')])se
lf.reset()if self.unique:self.init_unique_hash()def init_unique_hash(self):while
 self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in
 self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.un
ique_hash_dict[hash]
>temp_247|source:-*- coding: utf-8 -*-|start:6218|stop:6718
pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.fi
le_pointer = open(self.quals_file_path)self.file_pointer.seek(0)if self.lazy_ini
t:self.total_quals = Noneelse:self.total_quals = len([l for l in self.file_point
er.readlines() if l.startswith('>')])self.reset()def next(self):self.id = self.f
ile_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequalscor
es = ''while 1:line = self.file_pointer.readline()if not line:if len(qualscores)
:self.quals = qualsc
>temp_248|source:-*- coding: utf-8 -*-|start:172|stop:672
s published by the Free# Software Foundation; either version 2 of the License, o
r (at your option)# any later version.## Please read the docs/COPYING file.impor
t sysimport numpyimport hashlibclass FastaOutput:def __init__(self, output_file_
path):self.output_file_path = output_file_pathself.output_file_obj = open(output
_file_path, 'w')def store(self, entry, split = True, store_frequencies = True):i
f entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequen
cy:%d' % len(entry.i
>temp_249|source:-*- coding: utf-8 -*-|start:1864|stop:2364
niqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash =
 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if self
.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file_
pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self.init_u
nique_hash()def init_unique_hash(self):while self.next_regular():hash = hashlib.
sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_h
ash_dict[hash]['ids'
>temp_250|source:-*- coding: utf-8 -*-|start:5369|stop:5869
 max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_
distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', s
ize = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'ax
es.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.
5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_leng
ths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_len
gths),\max(sequence_
>temp_251|source:-*- coding: utf-8 -*-|start:5448|stop:5948
_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', 
size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'a
xes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0
.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_len
gths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_le
ngths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if 
dest == None:dest = 
>temp_252|source:-*- coding: utf-8 -*-|start:4691|stop:5191
rue)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.p
lot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0,
 max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)p
lt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep == Non
e:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_le
n_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep)
, rotation=90, size=
>temp_253|source:-*- coding: utf-8 -*-|start:3094|stop:3594
f.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_r
egular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()se
quence = ''while 1:line = self.file_pointer.readline()if not line:if len(sequenc
e):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswit
h('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence
 += line.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper
()self.pos += 1retur
>temp_254|source:-*- coding: utf-8 -*-|start:2632|stop:3132
_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_dict)self.
reset()def next(self):if self.unique:return self.next_unique()else:return self.n
ext_regular()def next_unique(self):if self.unique:if self.total_unique > 0 and s
elf.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_
list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_cas
e else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry
['ids']return Trueel
>temp_255|source:-*- coding: utf-8 -*-|start:487|stop:987
utput_file_path, 'w')def store(self, entry, split = True, store_frequencies = Tr
ue):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'fr
equency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.s
eq, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def wri
te_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_o
bj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range
(0, len(sequence), p
>temp_256|source:-*- coding: utf-8 -*-|start:3449|stop:3949
file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = sequence
 if self.allow_mixed_case else sequence.upper()self.pos += 1return Truedef get_s
eq_by_read_id(self, read_id):self.reset()while self.next():if self.id == read_id
:return self.seqreturn Falsedef close(self):self.file_pointer.close()def reset(s
elf):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.se
ek(0)def visualize_sequence_length_distribution(self, title, dest = None, max_se
q_len = None, xticks
>temp_257|source:-*- coding: utf-8 -*-|start:1004|stop:1504
en(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0,
 len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta:de
f __init__(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceSo
urce(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.
pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.f
asta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(
self.fasta.seq)sys.s
>temp_258|source:-*- coding: utf-8 -*-|start:3837|stop:4337
e_pointer.seek(0)def visualize_sequence_length_distribution(self, title, dest = 
None, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotlib.
pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.res
et()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write
('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.appe
nd(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_le
n = max(sequence_len
>temp_259|source:-*- coding: utf-8 -*-|start:4284|stop:4784
\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_l
engths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in se
quence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))p
lt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linesty
le='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt
.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.9
8)plt.plot(seq_len_d
>temp_260|source:-*- coding: utf-8 -*-|start:3282|stop:3782
lf.file_pointer.readline()if not line:if len(sequence):self.seq = sequenceself.p
os += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.see
k(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = s
equence if self.allow_mixed_case else sequence.upper()self.pos += 1return Truede
f get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.id == 
read_id:return self.seqreturn Falsedef close(self):self.file_pointer.close()def 
reset(self):self.pos
>temp_261|source:-*- coding: utf-8 -*-|start:417|stop:917
:self.output_file_path = output_file_pathself.output_file_obj = open(output_file
_path, 'w')def store(self, entry, split = True, store_frequencies = True):if ent
ry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d
' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)
def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(sel
f, seq, split = True):if split:seq = self.split(seq)self.output_file_obj.write('
%s\n' % seq)def spli
>temp_262|source:-*- coding: utf-8 -*-|start:3112|stop:3612
['ids']return Trueelse:return Falseelse:return Falsedef next_regular(self):self.
seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1
:line = self.file_pointer.readline()if not line:if len(sequence):self.seq = sequ
enceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_p
ointer.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip()se
lf.seq = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1ret
urn Truedef get_seq_
>temp_263|source:-*- coding: utf-8 -*-|start:3342|stop:3842
seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):s
elf.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += line
.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.p
os += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.n
ext():if self.id == read_id:return self.seqreturn Falsedef close(self):self.file
_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.i
ds = []self.file_poi
>temp_264|source:-*- coding: utf-8 -*-|start:3860|stop:4360
sualize_sequence_length_distribution(self, title, dest = None, max_seq_len = Non
e, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport mat
plotlib.gridspec as gridspecsequence_lengths = []self.reset()while self.next():i
f self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading:
 %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.r
eset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_length
s) + (int(max(sequen
>temp_265|source:-*- coding: utf-8 -*-|start:6798|stop:7298
turn Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.fi
le_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = 
qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos 
+= 1return Truedef close(self):self.file_pointer.close()def reset(self):self.pos
 = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_
pointer.seek(0)if __name__ == '__main__':fasta = SequenceSource(sys.argv[1])fast
a.visualize_sequence
>temp_266|source:-*- coding: utf-8 -*-|start:649|stop:1149
uency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq
, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write
_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_obj
.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(0
, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks
[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output
_file_obj.close()cla
>temp_267|source:-*- coding: utf-8 -*-|start:4820|stop:5320
= 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, 
color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('seque
nce length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep
 == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickst
ep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0
, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_dis
tribution) + 1, ytic
>temp_268|source:-*- coding: utf-8 -*-|start:5610|stop:6110
s.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5
, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengt
hs),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_leng
ths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if de
st == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.s
avefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init_
_(self, quals_file_p
>temp_269|source:-*- coding: utf-8 -*-|start:1570|stop:2070
ource():def __init__(self, fasta_file_path, lazy_init = True, unique = False, al
low_mixed_case = False):self.fasta_file_path = fasta_file_pathself.name = Nonese
lf.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self
.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict
 = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open
(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq 
= Noneelse:self.tota
>temp_270|source:-*- coding: utf-8 -*-|start:760|stop:1260
self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):i
f split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(s
elf, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) 
+ [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in rang
e(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFast
a:def __init__(self, f_name):self.ids = []self.sequences = []self.fasta = Sequen
ceSource(f_name)whil
>temp_271|source:-*- coding: utf-8 -*-|start:6320|stop:6820
.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = N
oneelse:self.total_quals = len([l for l in self.file_pointer.readlines() if l.st
artswith('>')])self.reset()def next(self):self.id = self.file_pointer.readline()
[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = 
self.file_pointer.readline()if not line:if len(qualscores):self.quals = qualscor
es.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retu
rn Trueelse:return F
>temp_272|source:-*- coding: utf-8 -*-|start:3253|stop:3753
equence = ''while 1:line = self.file_pointer.readline()if not line:if len(sequen
ce):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswi
th('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequenc
e += line.strip()self.seq = sequence if self.allow_mixed_case else sequence.uppe
r()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()whi
le self.next():if self.id == read_id:return self.seqreturn Falsedef close(self):
self.file_pointer.cl
>temp_273|source:-*- coding: utf-8 -*-|start:55|stop:555
ogram is free software; you can redistribute it and/or modify it under# the term
s of the GNU General Public License as published by the Free# Software Foundatio
n; either version 2 of the License, or (at your option)# any later version.## Pl
ease read the docs/COPYING file.import sysimport numpyimport hashlibclass FastaO
utput:def __init__(self, output_file_path):self.output_file_path = output_file_p
athself.output_file_obj = open(output_file_path, 'w')def store(self, entry, spli
t = True, store_freq
>temp_274|source:-*- coding: utf-8 -*-|start:4339|stop:4839
hs) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (ma
x_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.fi
gure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('gri
d', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 
= plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03
, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha
 = 0.3)plt.fill_betw
>temp_275|source:-*- coding: utf-8 -*-|start:4810|stop:5310
k', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution
, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xla
bel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if
 ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(ran
ge(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.ytic
ks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(s
eq_len_distribution)
>temp_276|source:-*- coding: utf-8 -*-|start:2673|stop:3173
nique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:retu
rn self.next_unique()else:return self.next_regular()def next_unique(self):if sel
f.unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = 
self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = ha
sh_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  
= hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:r
eturn Falsedef next_
>temp_277|source:-*- coding: utf-8 -*-|start:3187|stop:3687
self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''wh
ile 1:line = self.file_pointer.readline()if not line:if len(sequence):self.seq =
 sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.f
ile_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += line.stri
p()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.pos +=
 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next()
:if self.id == read_
>temp_278|source:-*- coding: utf-8 -*-|start:1903|stop:2403
que_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_f
ile_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:se
lf.total_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>'
)])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(self):
while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if ha
sh in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)se
lf.unique_hash_dict[
>temp_279|source:-*- coding: utf-8 -*-|start:4235|stop:4735
pend(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_
len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_
distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribu
tion[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linew
idth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs = grid
spec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(
left=0.05, bottom = 
>temp_280|source:-*- coding: utf-8 -*-|start:152|stop:652
ral Public License as published by the Free# Software Foundation; either version
 2 of the License, or (at your option)# any later version.## Please read the doc
s/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:def __init_
_(self, output_file_path):self.output_file_path = output_file_pathself.output_fi
le_obj = open(output_file_path, 'w')def store(self, entry, split = True, store_f
requencies = True):if entry.unique and store_frequencies:self.write_id('%s|%s' %
 (entry.id, 'frequen
>temp_281|source:-*- coding: utf-8 -*-|start:4976|stop:5476
equence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytick
step == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xti
ckstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(ran
ge(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len
_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_se
q_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distrib
ution) / 20.0))plt.f
>temp_282|source:-*- coding: utf-8 -*-|start:3179|stop:3679
r(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequenc
e = ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):se
lf.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'
):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += l
ine.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()sel
f.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while sel
f.next():if self.id 
>temp_283|source:-*- coding: utf-8 -*-|start:3988|stop:4488
tplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = [
]self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.std
err.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_len
gths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:m
ax_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)s
eq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_d
istribution[l] += 1f
>temp_284|source:-*- coding: utf-8 -*-|start:3929|stop:4429
q_len = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pl
timport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while se
lf.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastali
b] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.
seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequ
ence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution =
 [0] * (max_seq_len 
>temp_285|source:-*- coding: utf-8 -*-|start:141|stop:641
he GNU General Public License as published by the Free# Software Foundation; eit
her version 2 of the License, or (at your option)# any later version.## Please r
ead the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:
def __init__(self, output_file_path):self.output_file_path = output_file_pathsel
f.output_file_obj = open(output_file_path, 'w')def store(self, entry, split = Tr
ue, store_frequencies = True):if entry.unique and store_frequencies:self.write_i
d('%s|%s' % (entry.i
>temp_286|source:-*- coding: utf-8 -*-|start:861|stop:1361
lit(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_
length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]retu
rn '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)]
)def close(self):self.output_file_obj.close()class ReadFasta:def __init__(self, 
f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while
 self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stde
rr.write('\r[fastali
>temp_287|source:-*- coding: utf-8 -*-|start:1727|stop:2227
eself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0s
elf.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_d
ict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = o
pen(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_s
eq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() if 
l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_un
ique_hash(self):whil
>temp_288|source:-*- coding: utf-8 -*-|start:89|stop:589
distribute it and/or modify it under# the terms of the GNU General Public Licens
e as published by the Free# Software Foundation; either version 2 of the License
, or (at your option)# any later version.## Please read the docs/COPYING file.im
port sysimport numpyimport hashlibclass FastaOutput:def __init__(self, output_fi
le_path):self.output_file_path = output_file_pathself.output_file_obj = open(out
put_file_path, 'w')def store(self, entry, split = True, store_frequencies = True
):if entry.unique an
>temp_289|source:-*- coding: utf-8 -*-|start:3170|stop:3670
xt_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip
()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(seq
uence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.start
swith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequ
ence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence.u
pper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()
while self.next():if
>temp_290|source:-*- coding: utf-8 -*-|start:3073|stop:3573
= hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:r
eturn Falsedef next_regular(self):self.seq = Noneself.id = self.file_pointer.rea
dline()[1:].strip()sequence = ''while 1:line = self.file_pointer.readline()if no
t line:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse:return F
alseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - le
n(line))breaksequence += line.strip()self.seq = sequence if self.allow_mixed_cas
e else sequence.uppe
>temp_291|source:-*- coding: utf-8 -*-|start:961|stop:1461
 range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequen
ce[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):sel
f.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = [
]self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next():i
f self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastali
b] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.
append(self.fasta.id
>temp_292|source:-*- coding: utf-8 -*-|start:1565|stop:2065
enceSource():def __init__(self, fasta_file_path, lazy_init = True, unique = Fals
e, allow_mixed_case = False):self.fasta_file_path = fasta_file_pathself.name = N
oneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 
0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash
_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer =
 open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total
_seq = Noneelse:self
>temp_293|source:-*- coding: utf-8 -*-|start:4647|stop:5147
c(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, b
ottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'b
lack', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribut
ion, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.
xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 
1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(
range(xtickstep, max
>temp_294|source:-*- coding: utf-8 -*-|start:1758|stop:2258
.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Nonese
lf.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list =
 []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.f
ile_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = 
len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()
if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next
_regular():hash = ha
>temp_295|source:-*- coding: utf-8 -*-|start:2952|stop:3452
list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_cas
e else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry
['ids']return Trueelse:return Falseelse:return Falsedef next_regular(self):self.
seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1
:line = self.file_pointer.readline()if not line:if len(sequence):self.seq = sequ
enceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_p
ointer.seek(self.fil
>temp_296|source:-*- coding: utf-8 -*-|start:4363|stop:4863
lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in s
equence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))
plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linest
yle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])pl
t.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.
98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(
range(0, max_seq_len
>temp_297|source:-*- coding: utf-8 -*-|start:5052|stop:5552
tickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range
(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks
(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq
_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = ma
x_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_dis
tribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size
 = 'xx-large', ha = 
>temp_298|source:-*- coding: utf-8 -*-|start:662|stop:1162
en(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def w
rite_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, se
q, split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n'
 % seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequenc
e), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x +
 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.clo
se()class ReadFasta:
>temp_299|source:-*- coding: utf-8 -*-|start:6865|stop:7365
.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()
self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.spli
t()]self.pos += 1return Truedef close(self):self.file_pointer.close()def reset(s
elf):self.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids =
 []self.file_pointer.seek(0)if __name__ == '__main__':fasta = SequenceSource(sys
.argv[1])fasta.visualize_sequence_length_distribution(title = sys.argv[2] if len
(sys.argv) == 3 else
>temp_300|source:-*- coding: utf-8 -*-|start:3470|stop:3970
 len(line))breaksequence += line.strip()self.seq = sequence if self.allow_mixed_
case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, r
ead_id):self.reset()while self.next():if self.id == read_id:return self.seqretur
n Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self
.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_se
quence_length_distribution(self, title, dest = None, max_seq_len = None, xtickst
ep = None, ytickstep
>temp_301|source:-*- coding: utf-8 -*-|start:5919|stop:6419
arge')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')e
xcept:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:
def __init__(self, quals_file_path, lazy_init = True):self.quals_file_path = qua
ls_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Non
eself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(sel
f.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = 
Noneelse:self.total_
>temp_302|source:-*- coding: utf-8 -*-|start:1204|stop:1704
lf.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next():if s
elf.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] 
Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.app
end(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def
 close(self):self.fasta.close()class SequenceSource():def __init__(self, fasta_f
ile_path, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta
_file_path = fasta_f
>temp_303|source:-*- coding: utf-8 -*-|start:3564|stop:4064
ence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.r
eset()while self.next():if self.id == read_id:return self.seqreturn Falsedef clo
se(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Nonesel
f.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_
distribution(self, title, dest = None, max_seq_len = None, xtickstep = None, yti
ckstep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as grid
specsequence_lengths
>temp_304|source:-*- coding: utf-8 -*-|start:5321|stop:5821
step)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, 
ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtex
t(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')
ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(Fal
se)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std
: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths
), numpy.std(sequenc
>temp_305|source:-*- coding: utf-8 -*-|start:2865|stop:3365
0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.uniqu
e_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mi
xed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = has
h_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(self
):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''
while 1:line = self.file_pointer.readline()if not line:if len(sequence):self.seq
 = sequenceself.pos 
>temp_306|source:-*- coding: utf-8 -*-|start:2491|stop:2991
seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.u
nique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], revers
e = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(sel
f):if self.unique:return self.next_unique()else:return self.next_regular()def ne
xt_unique(self):if self.unique:if self.total_unique > 0 and self.pos < self.tota
l_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self
.pos += 1self.seq = 
>temp_307|source:-*- coding: utf-8 -*-|start:4603|stop:5103
yle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])pl
t.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.
98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(
range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha 
= 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickste
p == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = ma
x(seq_len_distributi
>temp_308|source:-*- coding: utf-8 -*-|start:821|stop:1321
eq, split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n
' % seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequen
ce), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x 
+ 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.cl
ose()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []
self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1
000 == 0 or self.fas
>temp_309|source:-*- coding: utf-8 -*-|start:5428|stop:5928
tion) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title)
, weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt
.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks
([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'
\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengt
hs),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, 
size = 'x-large')if 
>temp_310|source:-*- coding: utf-8 -*-|start:1732|stop:2232
.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.i
d  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict =
 {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(s
elf.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = 
Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.sta
rtswith('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_
hash(self):while sel
>temp_311|source:-*- coding: utf-8 -*-|start:4819|stop:5319
 = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0,
 color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequ
ence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickste
p == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xticks
tep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(
0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_di
stribution) + 1, yti
>temp_312|source:-*- coding: utf-8 -*-|start:4813|stop:5313
 alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y
2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel
('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if yt
ickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(
xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(
range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_
len_distribution) + 
>temp_313|source:-*- coding: utf-8 -*-|start:3857|stop:4357
 visualize_sequence_length_distribution(self, title, dest = None, max_seq_len = 
None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport 
matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while self.next(
):if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Readi
ng: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))sel
f.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_len
gths) + (int(max(seq
>temp_314|source:-*- coding: utf-8 -*-|start:160|stop:660
ic License as published by the Free# Software Foundation; either version 2 of th
e License, or (at your option)# any later version.## Please read the docs/COPYIN
G file.import sysimport numpyimport hashlibclass FastaOutput:def __init__(self, 
output_file_path):self.output_file_path = output_file_pathself.output_file_obj =
 open(output_file_path, 'w')def store(self, entry, split = True, store_frequenci
es = True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.
id, 'frequency:%d' %
>temp_315|source:-*- coding: utf-8 -*-|start:1200|stop:1700
[]self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next():
if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastal
ib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids
.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n'
)def close(self):self.fasta.close()class SequenceSource():def __init__(self, fas
ta_file_path, lazy_init = True, unique = False, allow_mixed_case = False):self.f
asta_file_path = fas
>temp_316|source:-*- coding: utf-8 -*-|start:2135|stop:2635
with('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_has
h(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdiges
t()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(se
lf.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] 
= {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_l
ist = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for ha
sh in self.unique_ha
>temp_317|source:-*- coding: utf-8 -*-|start:3181|stop:3681
self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence 
= ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):self
.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):
self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += lin
e.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.
pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.
next():if self.id ==
>temp_318|source:-*- coding: utf-8 -*-|start:4246|stop:4746
lf.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(s
equence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distributio
n = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 
1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9
})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSp
ec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, 
bottom = 0.03, top =
>temp_319|source:-*- coding: utf-8 -*-|start:5445|stop:5945
len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black
', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update(
{'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02
, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_
lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence
_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')
if dest == None:dest
>temp_320|source:-*- coding: utf-8 -*-|start:3987|stop:4487
atplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = 
[]self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.st
derr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_le
ngths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:
max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)
seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_
distribution[l] += 1
>temp_321|source:-*- coding: utf-8 -*-|start:1304|stop:1804
 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into m
emory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)se
lf.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fa
sta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy_init 
= True, unique = False, allow_mixed_case = False):self.fasta_file_path = fasta_f
ile_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_
mixed_caseself.pos =
>temp_322|source:-*- coding: utf-8 -*-|start:4206|stop:4706
rr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n
')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_len
gths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequ
ence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt
.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle
='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.g
rid(True)plt.subplot
>temp_323|source:-*- coding: utf-8 -*-|start:167|stop:667
nse as published by the Free# Software Foundation; either version 2 of the Licen
se, or (at your option)# any later version.## Please read the docs/COPYING file.
import sysimport numpyimport hashlibclass FastaOutput:def __init__(self, output_
file_path):self.output_file_path = output_file_pathself.output_file_obj = open(o
utput_file_path, 'w')def store(self, entry, split = True, store_frequencies = Tr
ue):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'fr
equency:%d' % len(en
>temp_324|source:-*- coding: utf-8 -*-|start:4865|stop:5365
 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('num
ber of sequences')plt.xlabel('sequence length')if xtickstep == None:xtickstep = 
(max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution
) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90,
 size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[
y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')p
lt.xlim(xmin = 0, xm
>temp_325|source:-*- coding: utf-8 -*-|start:681|stop:1181
:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):s
elf.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):if
 split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(se
lf, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) +
 [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range
(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta
:def __init__(self, 
>temp_326|source:-*- coding: utf-8 -*-|start:5485|stop:5985
5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 
= plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)p
lt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.
2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), n
umpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 
'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_p
athtry:plt.savefig(d
>temp_327|source:-*- coding: utf-8 -*-|start:1443|stop:1943
pend(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')de
f close(self):self.fasta.close()class SequenceSource():def __init__(self, fasta_
file_path, lazy_init = True, unique = False, allow_mixed_case = False):self.fast
a_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allo
w_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.id
s = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []se
lf.unique_next_hash 
>temp_328|source:-*- coding: utf-8 -*-|start:2992|stop:3492
ash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id 
 = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:
return Falsedef next_regular(self):self.seq = Noneself.id = self.file_pointer.re
adline()[1:].strip()sequence = ''while 1:line = self.file_pointer.readline()if n
ot line:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse:return 
Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - l
en(line))breaksequen
>temp_329|source:-*- coding: utf-8 -*-|start:6613|stop:7113
lscores = ''while 1:line = self.file_pointer.readline()if not line:if len(qualsc
ores):self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.qual
s.split()]self.pos += 1return Trueelse:return Falseif line.startswith('>'):self.
file_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + 
line.strip()self.quals = qualscores.strip()self.quals_int = [int(q) for q in sel
f.quals.split()]self.pos += 1return Truedef close(self):self.file_pointer.close(
)def reset(self):sel
>temp_330|source:-*- coding: utf-8 -*-|start:1367|stop:1867
ding FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append
(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def cl
ose(self):self.fasta.close()class SequenceSource():def __init__(self, fasta_file
_path, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta_fi
le_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mi
xed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = 
[]self.unique = uniq
>temp_331|source:-*- coding: utf-8 -*-|start:3860|stop:4360
sualize_sequence_length_distribution(self, title, dest = None, max_seq_len = Non
e, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport mat
plotlib.gridspec as gridspecsequence_lengths = []self.reset()while self.next():i
f self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading:
 %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.r
eset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_length
s) + (int(max(sequen
>temp_332|source:-*- coding: utf-8 -*-|start:3502|stop:4002
.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.p
os += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.n
ext():if self.id == read_id:return self.seqreturn Falsedef close(self):self.file
_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.i
ds = []self.file_pointer.seek(0)def visualize_sequence_length_distribution(self,
 title, dest = None, max_seq_len = None, xtickstep = None, ytickstep = None):imp
ort matplotlib.pyplo
>temp_333|source:-*- coding: utf-8 -*-|start:2447|stop:2947
t[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.uniqu
e_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash
)\for hash in self.unique_hash_dict], reverse = True)]self.total_unique = len(se
lf.unique_hash_dict)self.reset()def next(self):if self.unique:return self.next_u
nique()else:return self.next_regular()def next_unique(self):if self.unique:if se
lf.total_unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_ha
sh_dict[self.unique_
>temp_334|source:-*- coding: utf-8 -*-|start:2403|stop:2903
hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [se
lf.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted(
[(self.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict]
, reverse = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def 
next(self):if self.unique:return self.next_unique()else:return self.next_regular
()def next_unique(self):if self.unique:if self.total_unique > 0 and self.pos < s
elf.total_unique:has
>temp_335|source:-*- coding: utf-8 -*-|start:206|stop:706
Foundation; either version 2 of the License, or (at your option)# any later vers
ion.## Please read the docs/COPYING file.import sysimport numpyimport hashlibcla
ss FastaOutput:def __init__(self, output_file_path):self.output_file_path = outp
ut_file_pathself.output_file_obj = open(output_file_path, 'w')def store(self, en
try, split = True, store_frequencies = True):if entry.unique and store_frequenci
es:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self
.write_id(entry.id)s
>temp_336|source:-*- coding: utf-8 -*-|start:1765|stop:2265
mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids 
= []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self
.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_poi
nter.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l 
for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()if self
.unique:self.init_unique_hash()def init_unique_hash(self):while self.next_regula
r():hash = hashlib.s
>temp_337|source:-*- coding: utf-8 -*-|start:4861|stop:5361
en + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel(
'number of sequences')plt.xlabel('sequence length')if xtickstep == None:xtickste
p = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribu
tion) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation
=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickste
p),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-smal
l')plt.xlim(xmin = 0
>temp_338|source:-*- coding: utf-8 -*-|start:5998|stop:6498
except:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource
:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_path = qu
als_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = No
neself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(se
lf.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals =
 Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines() if l.
startswith('>')])sel
>temp_339|source:-*- coding: utf-8 -*-|start:668|stop:1168
ry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_i
d(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, spl
it = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq
)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence), pi
ece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] f
or x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()cl
ass ReadFasta:def __
>temp_340|source:-*- coding: utf-8 -*-|start:739|stop:1239
f write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self,
 seq, split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s
\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequ
ence), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[
x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.
close()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = 
[]self.fasta = Seque
>temp_341|source:-*- coding: utf-8 -*-|start:4008|stop:4508
ltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while s
elf.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastal
ib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self
.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(seq
uence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution 
= [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1f
ig = plt.figure(figs
>temp_342|source:-*- coding: utf-8 -*-|start:23|stop:523
C) 2014, A. Murat Eren## This program is free software; you can redistribute it 
and/or modify it under# the terms of the GNU General Public License as published
 by the Free# Software Foundation; either version 2 of the License, or (at your 
option)# any later version.## Please read the docs/COPYING file.import sysimport
 numpyimport hashlibclass FastaOutput:def __init__(self, output_file_path):self.
output_file_path = output_file_pathself.output_file_obj = open(output_file_path,
 'w')def store(self,
>temp_343|source:-*- coding: utf-8 -*-|start:4986|stop:5486
ngth')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == No
ne:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, ma
x_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(
seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distribut
ion) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.
ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 2
0.0))plt.figtext(0.5
>temp_344|source:-*- coding: utf-8 -*-|start:867|stop:1367
q)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length
 = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n
'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def c
lose(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name
):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while self.
fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.wri
te('\r[fastalib] Rea
>temp_345|source:-*- coding: utf-8 -*-|start:121|stop:621
nder# the terms of the GNU General Public License as published by the Free# Soft
ware Foundation; either version 2 of the License, or (at your option)# any later
 version.## Please read the docs/COPYING file.import sysimport numpyimport hashl
ibclass FastaOutput:def __init__(self, output_file_path):self.output_file_path =
 output_file_pathself.output_file_obj = open(output_file_path, 'w')def store(sel
f, entry, split = True, store_frequencies = True):if entry.unique and store_freq
uencies:self.write_i
>temp_346|source:-*- coding: utf-8 -*-|start:5683|stop:6183
02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequenc
e_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequen
ce_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large
')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')excep
t:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def 
__init__(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_f
ile_pathself.name = 
>temp_347|source:-*- coding: utf-8 -*-|start:908|stop:1408
)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence), pi
ece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] f
or x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()cl
ass ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self.fa
sta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 
0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memor
y: %s' % (self.fasta
>temp_348|source:-*- coding: utf-8 -*-|start:5030|stop:5530
seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 2
0 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size
='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for
 y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xl
im(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distributi
on) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), 
weight = 'black', si
>temp_349|source:-*- coding: utf-8 -*-|start:1554|stop:2054
)class SequenceSource():def __init__(self, fasta_file_path, lazy_init = True, un
ique = False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pathse
lf.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_case
self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.
unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.fil
e_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init
:self.total_seq = No
>temp_350|source:-*- coding: utf-8 -*-|start:3127|stop:3627
rueelse:return Falseelse:return Falsedef next_regular(self):self.seq = Noneself.
id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:line = self.fi
le_pointer.readline()if not line:if len(sequence):self.seq = sequenceself.pos +=
 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(sel
f.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = sequen
ce if self.allow_mixed_case else sequence.upper()self.pos += 1return Truedef get
_seq_by_read_id(self
>temp_351|source:-*- coding: utf-8 -*-|start:707|stop:1207
lf.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write(
'>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = self.split(se
q)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length
 = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n
'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def c
lose(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name
):self.ids = []self.
>temp_352|source:-*- coding: utf-8 -*-|start:4750|stop:5250
5, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.
fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'b
lack', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length
')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:y
tickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_se
q_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_
len_distribution) + 
>temp_353|source:-*- coding: utf-8 -*-|start:6583|stop:7083
= Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readl
ine()if not line:if len(qualscores):self.quals = qualscores.strip()self.quals_in
t = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Fals
eif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(l
ine))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.qu
als_int = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(
self):self.file_poin
>temp_354|source:-*- coding: utf-8 -*-|start:5825|stop:6325
ngths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.
8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(d
est + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passreturncla
ss QualSource:def __init__(self, quals_file_path, lazy_init = True):self.quals_f
ile_path = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0
self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_poin
ter = open(self.qual
>temp_355|source:-*- coding: utf-8 -*-|start:190|stop:690
 Free# Software Foundation; either version 2 of the License, or (at your option)
# any later version.## Please read the docs/COPYING file.import sysimport numpyi
mport hashlibclass FastaOutput:def __init__(self, output_file_path):self.output_
file_path = output_file_pathself.output_file_obj = open(output_file_path, 'w')de
f store(self, entry, split = True, store_frequencies = True):if entry.unique and
 store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry
.ids)))else:self.wri
>temp_356|source:-*- coding: utf-8 -*-|start:2592|stop:3092
['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_
unique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:ret
urn self.next_unique()else:return self.next_regular()def next_unique(self):if se
lf.unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry =
 self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = h
ash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id 
 = hash_entry['id']s
>temp_357|source:-*- coding: utf-8 -*-|start:4076|stop:4576
set()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.writ
e('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.app
end(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_l
en = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_d
istribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribut
ion[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewi
dth' : 0.9})plt.rc('
>temp_358|source:-*- coding: utf-8 -*-|start:5180|stop:5680
n=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickst
ep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-sma
ll')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_
distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' %
 (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.subplot(g
s[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])pl
t.xticks([])plt.text
>temp_359|source:-*- coding: utf-8 -*-|start:1360|stop:1860
ib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids
.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n'
)def close(self):self.fasta.close()class SequenceSource():def __init__(self, fas
ta_file_path, lazy_init = True, unique = False, allow_mixed_case = False):self.f
asta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.a
llow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself
.ids = []self.unique
>temp_360|source:-*- coding: utf-8 -*-|start:6022|stop:6522
+ '.png')try:plt.show()except:passreturnclass QualSource:def __init__(self, qual
s_file_path, lazy_init = True):self.quals_file_path = quals_file_pathself.name =
 Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.
quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.
file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_qua
ls = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.re
set()def next(self):
>temp_361|source:-*- coding: utf-8 -*-|start:1693|stop:2193
h = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_ca
se = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self
.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_
next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.see
k(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in
 self.file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:
self.init_unique_has
>temp_362|source:-*- coding: utf-8 -*-|start:1150|stop:1650
s ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self.fast
a = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 
or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory:
 %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.seq
uences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.cl
ose()class SequenceSource():def __init__(self, fasta_file_path, lazy_init = True
, unique = False, al
>temp_363|source:-*- coding: utf-8 -*-|start:315|stop:815
OPYING file.import sysimport numpyimport hashlibclass FastaOutput:def __init__(s
elf, output_file_path):self.output_file_path = output_file_pathself.output_file_
obj = open(output_file_path, 'w')def store(self, entry, split = True, store_freq
uencies = True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (e
ntry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write
_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write('>%s\n' 
% id)def write_seq(s
>temp_364|source:-*- coding: utf-8 -*-|start:6709|stop:7209
 = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.p
os += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.see
k(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self
.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]
self.pos += 1return Truedef close(self):self.file_pointer.close()def reset(self)
:self.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []s
elf.file_pointer.see
>temp_365|source:-*- coding: utf-8 -*-|start:743|stop:1243
ite_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq
, split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' 
% seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence
), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 
1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.clos
e()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []se
lf.fasta = SequenceS
>temp_366|source:-*- coding: utf-8 -*-|start:984|stop:1484
, piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1
]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close
()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []sel
f.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000
 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into m
emory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)se
lf.sequences.append(
>temp_367|source:-*- coding: utf-8 -*-|start:4646|stop:5146
ec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, 
bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = '
black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribu
tion, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt
.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or
 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks
(range(xtickstep, ma
>temp_368|source:-*- coding: utf-8 -*-|start:6724|stop:7224
trip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return T
rueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_poi
nter.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualsc
ores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1re
turn Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0se
lf.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointe
r.seek(0)if __name__
>temp_369|source:-*- coding: utf-8 -*-|start:293|stop:793
Please read the docs/COPYING file.import sysimport numpyimport hashlibclass Fast
aOutput:def __init__(self, output_file_path):self.output_file_path = output_file
_pathself.output_file_obj = open(output_file_path, 'w')def store(self, entry, sp
lit = True, store_frequencies = True):if entry.unique and store_frequencies:self
.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_
id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_f
ile_obj.write('>%s\n
>temp_370|source:-*- coding: utf-8 -*-|start:4031|stop:4531
dspec as gridspecsequence_lengths = []self.reset()while self.next():if self.pos 
% 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (sel
f.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.s
tderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(m
ax(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 
1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize
 = (16, 12))plt.rcPa
>temp_371|source:-*- coding: utf-8 -*-|start:1310|stop:1810
or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory:
 %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.seq
uences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.cl
ose()class SequenceSource():def __init__(self, fasta_file_path, lazy_init = True
, unique = False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pa
thself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_
caseself.pos = 0self
>temp_372|source:-*- coding: utf-8 -*-|start:6086|stop:6586
nit__(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file
_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.q
uals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals
_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneels
e:self.total_quals = len([l for l in self.file_pointer.readlines() if l.startswi
th('>')])self.reset()def next(self):self.id = self.file_pointer.readline()[1:].s
trip()self.quals = N
>temp_373|source:-*- coding: utf-8 -*-|start:2973|stop:3473
os += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['se
q'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueels
e:return Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = s
elf.file_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_poi
nter.readline()if not line:if len(sequence):self.seq = sequenceself.pos += 1retu
rn Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file
_pointer.tell() - le
>temp_374|source:-*- coding: utf-8 -*-|start:4695|stop:5195
plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(
seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max
_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.y
label('number of sequences')plt.xlabel('sequence length')if xtickstep == None:xt
ickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_di
stribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), ro
tation=90, size='xx-
>temp_375|source:-*- coding: utf-8 -*-|start:4304|stop:4804
en:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 
10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_l
en_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({
'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.
1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subpl
ots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_d
istribution, color =
>temp_376|source:-*- coding: utf-8 -*-|start:5275|stop:5775
 range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xm
in = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) +
 (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weigh
t = 'black', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcPara
ms.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt
.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len
(sequence_lengths),n
>temp_377|source:-*- coding: utf-8 -*-|start:4051|stop:4551
uence_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or self
.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.fl
ush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if 
not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths)
 / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_
lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcPa
rams.update({'axes.l
>temp_378|source:-*- coding: utf-8 -*-|start:49|stop:549
his program is free software; you can redistribute it and/or modify it under# th
e terms of the GNU General Public License as published by the Free# Software Fou
ndation; either version 2 of the License, or (at your option)# any later version
.## Please read the docs/COPYING file.import sysimport numpyimport hashlibclass 
FastaOutput:def __init__(self, output_file_path):self.output_file_path = output_
file_pathself.output_file_obj = open(output_file_path, 'w')def store(self, entry
, split = True, stor
>temp_379|source:-*- coding: utf-8 -*-|start:4059|stop:4559
ngths = []self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 
1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()seq
uence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not max_
seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0
) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:
seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.upd
ate({'axes.linewidth
>temp_380|source:-*- coding: utf-8 -*-|start:5521|stop:6021
lack', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.upd
ate({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(
0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(seque
nce_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequ
ence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-lar
ge')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')exc
ept:plt.savefig(dest
>temp_381|source:-*- coding: utf-8 -*-|start:695|stop:1195
(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_fil
e_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = s
elf.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, 
piece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence
)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks)
 - 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(
self, f_name):self.i
>temp_382|source:-*- coding: utf-8 -*-|start:3077|stop:3577
sh_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:retur
n Falsedef next_regular(self):self.seq = Noneself.id = self.file_pointer.readlin
e()[1:].strip()sequence = ''while 1:line = self.file_pointer.readline()if not li
ne:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse:return False
if line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(li
ne))breaksequence += line.strip()self.seq = sequence if self.allow_mixed_case el
se sequence.upper()s
>temp_383|source:-*- coding: utf-8 -*-|start:748|stop:1248
d(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, spl
it = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq
)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence), pi
ece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] f
or x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()cl
ass ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self.fa
sta = SequenceSource
>temp_384|source:-*- coding: utf-8 -*-|start:6634|stop:7134
ine = self.file_pointer.readline()if not line:if len(qualscores):self.quals = qu
alscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos +=
 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(sel
f.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.qual
s = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.
pos += 1return Truedef close(self):self.file_pointer.close()def reset(self):self
.pos = 0self.id  = N
>temp_385|source:-*- coding: utf-8 -*-|start:5572|stop:6072
ubplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.ytic
ks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / mi
n: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.st
d(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center
', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:
plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:
passreturnclass Qual
>temp_386|source:-*- coding: utf-8 -*-|start:6807|stop:7307
else:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointe
r.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscore
s.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retur
n Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.
id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer.s
eek(0)if __name__ == '__main__':fasta = SequenceSource(sys.argv[1])fasta.visuali
ze_sequence_length_d
>temp_387|source:-*- coding: utf-8 -*-|start:2454|stop:2954
 = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_
list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for h
ash in self.unique_hash_dict], reverse = True)]self.total_unique = len(self.uniq
ue_hash_dict)self.reset()def next(self):if self.unique:return self.next_unique()
else:return self.next_regular()def next_unique(self):if self.unique:if self.tota
l_unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict
[self.unique_hash_li
>temp_388|source:-*- coding: utf-8 -*-|start:3006|stop:3506
'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry[
'id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsede
f next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].s
trip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len
(sequence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.s
tartswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))break
sequence += line.str
>temp_389|source:-*- coding: utf-8 -*-|start:857|stop:1357
f.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, pi
ece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]
return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) -
 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(se
lf, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)w
hile self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.
stderr.write('\r[fas
>temp_390|source:-*- coding: utf-8 -*-|start:140|stop:640
the GNU General Public License as published by the Free# Software Foundation; ei
ther version 2 of the License, or (at your option)# any later version.## Please 
read the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput
:def __init__(self, output_file_path):self.output_file_path = output_file_pathse
lf.output_file_obj = open(output_file_path, 'w')def store(self, entry, split = T
rue, store_frequencies = True):if entry.unique and store_frequencies:self.write_
id('%s|%s' % (entry.
>temp_391|source:-*- coding: utf-8 -*-|start:1505|stop:2005
derr.write('\n')def close(self):self.fasta.close()class SequenceSource():def __i
nit__(self, fasta_file_path, lazy_init = True, unique = False, allow_mixed_case 
= False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = 
lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself
.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.uniqu
e_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_fil
e_path)self.file_poi
>temp_392|source:-*- coding: utf-8 -*-|start:1033|stop:1533
([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(s
elf):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self
.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.
next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r
[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()s
elf.ids.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.wri
te('\n')def close(se
>temp_393|source:-*- coding: utf-8 -*-|start:2360|stop:2860
ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_h
ash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}sel
f.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'
], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_unique =
 len(self.unique_hash_dict)self.reset()def next(self):if self.unique:return self
.next_unique()else:return self.next_regular()def next_unique(self):if self.uniqu
e:if self.total_uniq
>temp_394|source:-*- coding: utf-8 -*-|start:581|stop:1081
nique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % 
len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def 
write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, s
eq, split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n
' % seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequen
ce), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x 
+ 1]] for x in range
>temp_395|source:-*- coding: utf-8 -*-|start:791|stop:1291
\n' % id)def write_seq(self, seq, split = True):if split:seq = self.split(seq)se
lf.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 8
0):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.jo
in([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close
(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):se
lf.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while self.fast
a.next():if self.fas
>temp_396|source:-*- coding: utf-8 -*-|start:2357|stop:2857
]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.uniqu
e_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}
self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['cou
nt'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_uniqu
e = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:return s
elf.next_unique()else:return self.next_regular()def next_unique(self):if self.un
ique:if self.total_u
>temp_397|source:-*- coding: utf-8 -*-|start:2249|stop:2749
hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dic
t:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash][
'count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id]
,'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self
.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reve
rse = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(s
elf):if self.unique:
>temp_398|source:-*- coding: utf-8 -*-|start:5128|stop:5628
ange(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yt
icks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max
(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax 
= max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len
_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', 
size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'a
xes.edgecolor' : 20}
>temp_399|source:-*- coding: utf-8 -*-|start:2942|stop:3442
ique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow
_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = 
hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(s
elf):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence =
 ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):self.
seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):s
elf.file_pointer.see
>temp_400|source:-*- coding: utf-8 -*-|start:3281|stop:3781
elf.file_pointer.readline()if not line:if len(sequence):self.seq = sequenceself.
pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.se
ek(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = 
sequence if self.allow_mixed_case else sequence.upper()self.pos += 1return Trued
ef get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.id ==
 read_id:return self.seqreturn Falsedef close(self):self.file_pointer.close()def
 reset(self):self.po
>temp_401|source:-*- coding: utf-8 -*-|start:1359|stop:1859
lib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.id
s.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n
')def close(self):self.fasta.close()class SequenceSource():def __init__(self, fa
sta_file_path, lazy_init = True, unique = False, allow_mixed_case = False):self.
fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.
allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Nonesel
f.ids = []self.uniqu
>temp_402|source:-*- coding: utf-8 -*-|start:5174|stop:5674
otation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, y
tickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='
xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(se
q_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, 
'%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.sub
plot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks
([])plt.xticks([])pl
>temp_403|source:-*- coding: utf-8 -*-|start:6150|stop:6650
ath = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.
id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer =
 open(self.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total
_quals = Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines
() if l.startswith('>')])self.reset()def next(self):self.id = self.file_pointer.
readline()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while
 1:line = self.file_
>temp_404|source:-*- coding: utf-8 -*-|start:1933|stop:2433
next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.see
k(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in
 self.file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:
self.init_unique_hash()def init_unique_hash(self):while self.next_regular():hash
 = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:se
lf.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['cou
nt'] += 1else:self.u
>temp_405|source:-*- coding: utf-8 -*-|start:6029|stop:6529
')try:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file_
path, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Nonese
lf.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_i
nt = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_po
inter.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = le
n([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()de
f next(self):self.id
>temp_406|source:-*- coding: utf-8 -*-|start:5990|stop:6490
 '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass Qu
alSource:def __init__(self, quals_file_path, lazy_init = True):self.quals_file_p
ath = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.
id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer =
 open(self.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total
_quals = Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines
() if l.startswith('
>temp_407|source:-*- coding: utf-8 -*-|start:5744|stop:6244
 %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_l
engths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0
.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(
dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passreturncl
ass QualSource:def __init__(self, quals_file_path, lazy_init = True):self.quals_
file_path = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.pos = 
0self.id  = Noneself
>temp_408|source:-*- coding: utf-8 -*-|start:3864|stop:4364
ize_sequence_length_distribution(self, title, dest = None, max_seq_len = None, x
tickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport matplot
lib.gridspec as gridspecsequence_lengths = []self.reset()while self.next():if se
lf.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s'
 % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.reset
()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) +
 (int(max(sequence_l
>temp_409|source:-*- coding: utf-8 -*-|start:4362|stop:4862
_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in 
sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12)
)plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', lines
tyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])p
lt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0
.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between
(range(0, max_seq_le
>temp_410|source:-*- coding: utf-8 -*-|start:4461|stop:4961
q_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.rcParams.updat
e({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth
=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.su
bplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_le
n_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_l
en + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel(
'number of sequences
>temp_411|source:-*- coding: utf-8 -*-|start:5819|stop:6319
nce_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center', alph
a = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt.sav
efig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:passret
urnclass QualSource:def __init__(self, quals_file_path, lazy_init = True):self.q
uals_file_path = quals_file_pathself.name = Noneself.lazy_init = lazy_initself.p
os = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.fil
e_pointer = open(sel
>temp_412|source:-*- coding: utf-8 -*-|start:1841|stop:2341
ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []
self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file
_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len
([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()if 
self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next_re
gular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_
hash_dict:self.uniqu
>temp_413|source:-*- coding: utf-8 -*-|start:1581|stop:2081
 __init__(self, fasta_file_path, lazy_init = True, unique = False, allow_mixed_c
ase = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_ini
t = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = None
self.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.u
nique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta
_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:
self.total_seq = len
>temp_414|source:-*- coding: utf-8 -*-|start:6500|stop:7000
reset()def next(self):self.id = self.file_pointer.readline()[1:].strip()self.qua
ls = Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.re
adline()if not line:if len(qualscores):self.quals = qualscores.strip()self.quals
_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return F
alseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - le
n(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self
.quals_int = [int(q)
>temp_415|source:-*- coding: utf-8 -*-|start:2146|stop:2646
self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(self):whi
le self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash 
in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.
unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : s
elf.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1]
 for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in self.
unique_hash_dict], r
>temp_416|source:-*- coding: utf-8 -*-|start:412|stop:912
path):self.output_file_path = output_file_pathself.output_file_obj = open(output
_file_path, 'w')def store(self, entry, split = True, store_frequencies = True):i
f entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequen
cy:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, s
plit)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_se
q(self, seq, split = True):if split:seq = self.split(seq)self.output_file_obj.wr
ite('%s\n' % seq)def
>temp_417|source:-*- coding: utf-8 -*-|start:5799|stop:6299
hs), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),
\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_
file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.
show()except:passreturnclass QualSource:def __init__(self, quals_file_path, lazy
_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_in
it = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Nones
elf.ids = []self.fil
>temp_418|source:-*- coding: utf-8 -*-|start:6795|stop:7295
1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self
.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals
 = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.p
os += 1return Truedef close(self):self.file_pointer.close()def reset(self):self.
pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []self.fi
le_pointer.seek(0)if __name__ == '__main__':fasta = SequenceSource(sys.argv[1])f
asta.visualize_seque
>temp_419|source:-*- coding: utf-8 -*-|start:1474|stop:1974
es.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.close(
)class SequenceSource():def __init__(self, fasta_file_path, lazy_init = True, un
ique = False, allow_mixed_case = False):self.fasta_file_path = fasta_file_pathse
lf.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_case
self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.
unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.fil
e_pointer = open(sel
>temp_420|source:-*- coding: utf-8 -*-|start:605|stop:1105
ies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:sel
f.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.
output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):if spl
it:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, 
sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) + [le
n(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, 
len(ticks) - 1)])def
>temp_421|source:-*- coding: utf-8 -*-|start:467|stop:967
ut_file_obj = open(output_file_path, 'w')def store(self, entry, split = True, st
ore_frequencies = True):if entry.unique and store_frequencies:self.write_id('%s|
%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)se
lf.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write(
'>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = self.split(se
q)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length
 = 80):ticks = range
>temp_422|source:-*- coding: utf-8 -*-|start:2180|stop:2680
it_unique_hash()def init_unique_hash(self):while self.next_regular():hash = hash
lib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.uniq
ue_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] +=
 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': sel
f.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_has
h_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)
]self.total_unique =
>temp_423|source:-*- coding: utf-8 -*-|start:1217|stop:1717
= []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos
 % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA
 into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fast
a.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):
self.fasta.close()class SequenceSource():def __init__(self, fasta_file_path, laz
y_init = True, unique = False, allow_mixed_case = False):self.fasta_file_path = 
fasta_file_pathself.
>temp_424|source:-*- coding: utf-8 -*-|start:5788|stop:6288
uence_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence
_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = 
self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.pn
g')try:plt.show()except:passreturnclass QualSource:def __init__(self, quals_file
_path, lazy_init = True):self.quals_file_path = quals_file_pathself.name = Nones
elf.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_
int = Noneself.ids =
>temp_425|source:-*- coding: utf-8 -*-|start:3583|stop:4083
s += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.ne
xt():if self.id == read_id:return self.seqreturn Falsedef close(self):self.file_
pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.id
s = []self.file_pointer.seek(0)def visualize_sequence_length_distribution(self, 
title, dest = None, max_seq_len = None, xtickstep = None, ytickstep = None):impo
rt matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_length
s = []self.reset()wh
>temp_426|source:-*- coding: utf-8 -*-|start:714|stop:1214
e_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write('>%s\n'
 % id)def write_seq(self, seq, split = True):if split:seq = self.split(seq)self.
output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):
ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join(
[sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(se
lf):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.
ids = []self.sequenc
>temp_427|source:-*- coding: utf-8 -*-|start:6682|stop:7182
 len(qualscores):self.quals = qualscores.strip()self.quals_int = [int(q) for q i
n self.quals.split()]self.pos += 1return Trueelse:return Falseif line.startswith
('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakqualscore
s += ' ' + line.strip()self.quals = qualscores.strip()self.quals_int = [int(q) f
or q in self.quals.split()]self.pos += 1return Truedef close(self):self.file_poi
nter.close()def reset(self):self.pos = 0self.id  = Noneself.quals = Noneself.qua
ls_int = Noneself.id
>temp_428|source:-*- coding: utf-8 -*-|start:3595|stop:4095
 Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if self
.id == read_id:return self.seqreturn Falsedef close(self):self.file_pointer.clos
e()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.f
ile_pointer.seek(0)def visualize_sequence_length_distribution(self, title, dest 
= None, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotli
b.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.r
eset()while self.nex
>temp_429|source:-*- coding: utf-8 -*-|start:5401|stop:5901
ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtex
t(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')
ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(Fal
se)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std
: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths
), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\v
a = 'center', alpha 
>temp_430|source:-*- coding: utf-8 -*-|start:1856|stop:2356
ique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_nex
t_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0
)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in se
lf.file_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:sel
f.init_unique_hash()def init_unique_hash(self):while self.next_regular():hash = 
hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.
unique_hash_dict[has
>temp_431|source:-*- coding: utf-8 -*-|start:2193|stop:2693
h()def init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(self
.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[
hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.u
nique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count'
: 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash][
'count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_u
nique = len(self.uni
>temp_432|source:-*- coding: utf-8 -*-|start:543|stop:1043
, store_frequencies = True):if entry.unique and store_frequencies:self.write_id(
'%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.i
d)self.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.wr
ite('>%s\n' % id)def write_seq(self, seq, split = True):if split:seq = self.spli
t(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_le
ngth = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return
 '\n'.join([sequence
>temp_433|source:-*- coding: utf-8 -*-|start:1751|stop:2251
nitself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq =
 Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash
_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path
)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total
_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.
reset()if self.unique:self.init_unique_hash()def init_unique_hash(self):while se
lf.next_regular():ha
>temp_434|source:-*- coding: utf-8 -*-|start:3223|stop:3723
ointer.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.readl
ine()if not line:if len(sequence):self.seq = sequenceself.pos += 1return Trueels
e:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.t
ell() - len(line))breaksequence += line.strip()self.seq = sequence if self.allow
_mixed_case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(
self, read_id):self.reset()while self.next():if self.id == read_id:return self.s
eqreturn Falsedef cl
>temp_435|source:-*- coding: utf-8 -*-|start:5091|stop:5591
n_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep)
, rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1
, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],siz
e='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max
(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.9
6, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.
subplot(gs[9])plt.rc
>temp_436|source:-*- coding: utf-8 -*-|start:2580|stop:3080
h_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)
]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if sel
f.unique:return self.next_unique()else:return self.next_regular()def next_unique
(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_unique:
hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1
self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upp
er()self.id  = hash_
>temp_437|source:-*- coding: utf-8 -*-|start:3995|stop:4495
b.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.r
eset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.wri
te('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.ap
pend(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_
len = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_
distribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribu
tion[l] += 1fig = pl
>temp_438|source:-*- coding: utf-8 -*-|start:5767|stop:6267
ngths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_l
engths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if
 dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:pl
t.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __in
it__(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_
pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.qu
als = Noneself.quals
>temp_439|source:-*- coding: utf-8 -*-|start:1001|stop:1501
 [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range
(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta
:def __init__(self, f_name):self.ids = []self.sequences = []self.fasta = Sequenc
eSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fas
ta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (sel
f.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.appe
nd(self.fasta.seq)sy
>temp_440|source:-*- coding: utf-8 -*-|start:4094|stop:4594
xt():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Re
ading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))
self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_
lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] 
* (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = p
lt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc
('grid', color='0.50
>temp_441|source:-*- coding: utf-8 -*-|start:9|stop:509
6# Copyright (C) 2014, A. Murat Eren## This program is free software; you can re
distribute it and/or modify it under# the terms of the GNU General Public Licens
e as published by the Free# Software Foundation; either version 2 of the License
, or (at your option)# any later version.## Please read the docs/COPYING file.im
port sysimport numpyimport hashlibclass FastaOutput:def __init__(self, output_fi
le_path):self.output_file_path = output_file_pathself.output_file_obj = open(out
put_file_path, 'w')d
>temp_442|source:-*- coding: utf-8 -*-|start:1131|stop:1631
ile_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequ
ences = []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fas
ta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading
 FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(sel
f.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(
self):self.fasta.close()class SequenceSource():def __init__(self, fasta_file_pat
h, lazy_init = True,
>temp_443|source:-*- coding: utf-8 -*-|start:2483|stop:2983
lf.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted(
[(self.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict]
, reverse = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def 
next(self):if self.unique:return self.next_unique()else:return self.next_regular
()def next_unique(self):if self.unique:if self.total_unique > 0 and self.pos < s
elf.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.p
os]]self.pos += 1sel
>temp_444|source:-*- coding: utf-8 -*-|start:6554|stop:7054
line()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:l
ine = self.file_pointer.readline()if not line:if len(qualscores):self.quals = qu
alscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos +=
 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(sel
f.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.qual
s = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.
pos += 1return Trued
>temp_445|source:-*- coding: utf-8 -*-|start:344|stop:844
numpyimport hashlibclass FastaOutput:def __init__(self, output_file_path):self.o
utput_file_path = output_file_pathself.output_file_obj = open(output_file_path, 
'w')def store(self, entry, split = True, store_frequencies = True):if entry.uniq
ue and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len
(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def wri
te_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq,
 split = True):if sp
>temp_446|source:-*- coding: utf-8 -*-|start:3806|stop:4306
seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_di
stribution(self, title, dest = None, max_seq_len = None, xtickstep = None, ytick
step = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridsp
ecsequence_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or
 self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stde
rr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n
')if not max_seq_len
>temp_447|source:-*- coding: utf-8 -*-|start:439|stop:939
 = output_file_pathself.output_file_obj = open(output_file_path, 'w')def store(s
elf, entry, split = True, store_frequencies = True):if entry.unique and store_fr
equencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))el
se:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, id)
:self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True):
if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(
self, sequence, piec
>temp_448|source:-*- coding: utf-8 -*-|start:3741|stop:4241
e_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.
ids = []self.file_pointer.seek(0)def visualize_sequence_length_distribution(self
, title, dest = None, max_seq_len = None, xtickstep = None, ytickstep = None):im
port matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_leng
ths = []self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:
sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()seque
nce_lengths.append(l
>temp_449|source:-*- coding: utf-8 -*-|start:4175|stop:4675
ding: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))s
elf.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_l
engths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] *
 (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = pl
t.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc(
'grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)
ax1 = plt.subplot(gs
>temp_450|source:-*- coding: utf-8 -*-|start:307|stop:807
e docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:def __
init__(self, output_file_path):self.output_file_path = output_file_pathself.outp
ut_file_obj = open(output_file_path, 'w')def store(self, entry, split = True, st
ore_frequencies = True):if entry.unique and store_frequencies:self.write_id('%s|
%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)se
lf.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write(
'>%s\n' % id)def wri
>temp_451|source:-*- coding: utf-8 -*-|start:6749|stop:7249
nt(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif li
ne.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))b
reakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_in
t = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(self):
self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.quals =
 Noneself.quals_int = Noneself.ids = []self.file_pointer.seek(0)if __name__ == '
__main__':fasta = Se
>temp_452|source:-*- coding: utf-8 -*-|start:3248|stop:3748
ip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(s
equence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.sta
rtswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakse
quence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence
.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset
()while self.next():if self.id == read_id:return self.seqreturn Falsedef close(s
elf):self.file_point
>temp_453|source:-*- coding: utf-8 -*-|start:4877|stop:5377
_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of seque
nces')plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len
 / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1p
lt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-sm
all')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in r
ange(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin
 = 0, xmax = max_seq
>temp_454|source:-*- coding: utf-8 -*-|start:3794|stop:4294
 = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequen
ce_length_distribution(self, title, dest = None, max_seq_len = None, xtickstep =
 None, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gridsp
ec as gridspecsequence_lengths = []self.reset()while self.next():if self.pos % 1
0000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.p
os))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stde
rr.write('\n')if not
>temp_455|source:-*- coding: utf-8 -*-|start:5236|stop:5736
stribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, 
ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin 
= 0, ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.f
igtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'cen
ter')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.gri
d(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f 
/ std: %.2f / min: %
>temp_456|source:-*- coding: utf-8 -*-|start:5846|stop:6346
lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')i
f dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:p
lt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __i
nit__(self, quals_file_path, lazy_init = True):self.quals_file_path = quals_file
_pathself.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.q
uals = Noneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals
_file_path)self.file
>temp_457|source:-*- coding: utf-8 -*-|start:5104|stop:5604
n) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90
, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),
[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')
plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_dist
ribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (ti
tle), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9]
)plt.rcParams.update
>temp_458|source:-*- coding: utf-8 -*-|start:881|stop:1381
file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks =
 range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequen
ce[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):sel
f.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = [
]self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next():i
f self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastali
b] Reading FASTA int
>temp_459|source:-*- coding: utf-8 -*-|start:3625|stop:4125
lf, read_id):self.reset()while self.next():if self.id == read_id:return self.seq
return Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 
0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visuali
ze_sequence_length_distribution(self, title, dest = None, max_seq_len = None, xt
ickstep = None, ytickstep = None):import matplotlib.pyplot as pltimport matplotl
ib.gridspec as gridspecsequence_lengths = []self.reset()while self.next():if sel
f.pos % 10000 == 0 o
>temp_460|source:-*- coding: utf-8 -*-|start:931|stop:1431
ce, piece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequ
ence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ti
cks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __ini
t__(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_
name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 
1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.po
s))sys.stderr.flush(
>temp_461|source:-*- coding: utf-8 -*-|start:2057|stop:2557
lse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.startswi
th('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(
self):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest(
)if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self
.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = 
{'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_lis
t = [i[1] for i in s
>temp_462|source:-*- coding: utf-8 -*-|start:3811|stop:4311
 Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_distrib
ution(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickstep 
= None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecseq
uence_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or self
.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.fl
ush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if 
not max_seq_len:max_
>temp_463|source:-*- coding: utf-8 -*-|start:4486|stop:4986
1fig = plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9
})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSp
ec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, 
bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = '
black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribu
tion, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt
.xlabel('sequence le
>temp_464|source:-*- coding: utf-8 -*-|start:1247|stop:1747
e(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos
 == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fast
a.pos))sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(sel
f.fasta.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class Sequen
ceSource():def __init__(self, fasta_file_path, lazy_init = True, unique = False,
 allow_mixed_case = False):self.fasta_file_path = fasta_file_pathself.name = Non
eself.lazy_init = la
>temp_465|source:-*- coding: utf-8 -*-|start:2385|stop:2885
.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : 
self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1
] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in self
.unique_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_dic
t)self.reset()def next(self):if self.unique:return self.next_unique()else:return
 self.next_regular()def next_unique(self):if self.unique:if self.total_unique > 
0 and self.pos < sel
>temp_466|source:-*- coding: utf-8 -*-|start:6350|stop:6850
nter.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len
([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()def
 next(self):self.id = self.file_pointer.readline()[1:].strip()self.quals = Nones
elf.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()if
 not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [in
t(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif lin
e.startswith('>'):se
>temp_467|source:-*- coding: utf-8 -*-|start:1220|stop:1720
]self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 
1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA in
to memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.i
d)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):sel
f.fasta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy_i
nit = True, unique = False, allow_mixed_case = False):self.fasta_file_path = fas
ta_file_pathself.nam
>temp_468|source:-*- coding: utf-8 -*-|start:6745|stop:7245
= [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falsei
f line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(lin
e))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.qual
s_int = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(se
lf):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.qua
ls = Noneself.quals_int = Noneself.ids = []self.file_pointer.seek(0)if __name__ 
== '__main__':fasta 
>temp_469|source:-*- coding: utf-8 -*-|start:1670|stop:2170
se):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_
initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq 
= Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_has
h_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_pat
h)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.tota
l_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self
.reset()if self.uniq
>temp_470|source:-*- coding: utf-8 -*-|start:3371|stop:3871
turn Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.fi
le_pointer.tell() - len(line))breaksequence += line.strip()self.seq = sequence i
f self.allow_mixed_case else sequence.upper()self.pos += 1return Truedef get_seq
_by_read_id(self, read_id):self.reset()while self.next():if self.id == read_id:r
eturn self.seqreturn Falsedef close(self):self.file_pointer.close()def reset(sel
f):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek
(0)def visualize_seq
>temp_471|source:-*- coding: utf-8 -*-|start:6653|stop:7153
nter.readline()if not line:if len(qualscores):self.quals = qualscores.strip()sel
f.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:r
eturn Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell
() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.stri
p()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return True
def close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  = 
Noneself.quals = Non
>temp_472|source:-*- coding: utf-8 -*-|start:568|stop:1068
e):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'fre
quency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.se
q, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def writ
e_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_ob
j.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(
0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequence[tick
s[x]:ticks[x + 1]] f
>temp_473|source:-*- coding: utf-8 -*-|start:3758|stop:4258
def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file
_pointer.seek(0)def visualize_sequence_length_distribution(self, title, dest = N
one, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotlib.p
yplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.rese
t()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write(
'\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.appen
d(len(self.seq))self
>temp_474|source:-*- coding: utf-8 -*-|start:4194|stop:4694
os))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stde
rr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(
sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)f
or l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = 
(16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50
', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(g
s[0:8])plt.grid(True
>temp_475|source:-*- coding: utf-8 -*-|start:2513|stop:3013
: 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict[hash][
'count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.total_u
nique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:retu
rn self.next_unique()else:return self.next_regular()def next_unique(self):if sel
f.unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = 
self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = ha
sh_entry['seq'] if s
>temp_476|source:-*- coding: utf-8 -*-|start:3996|stop:4496
.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.re
set()while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.writ
e('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.app
end(len(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_l
en = max(sequence_lengths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_d
istribution = [0] * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribut
ion[l] += 1fig = plt
>temp_477|source:-*- coding: utf-8 -*-|start:4550|stop:5050
linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs =
 gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_ad
just(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distrib
ution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), 
seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number o
f sequences')plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_
seq_len / 50) or 1if
>temp_478|source:-*- coding: utf-8 -*-|start:2765|stop:3265
_unique()else:return self.next_regular()def next_unique(self):if self.unique:if 
self.total_unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_
hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['se
q'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry
['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsed
ef next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].
strip()sequence = ''
>temp_479|source:-*- coding: utf-8 -*-|start:4802|stop:5302
 = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_dist
ribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences'
)plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50
) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xt
icks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')
plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(
0, max(seq_len_distr
>temp_480|source:-*- coding: utf-8 -*-|start:192|stop:692
ree# Software Foundation; either version 2 of the License, or (at your option)# 
any later version.## Please read the docs/COPYING file.import sysimport numpyimp
ort hashlibclass FastaOutput:def __init__(self, output_file_path):self.output_fi
le_path = output_file_pathself.output_file_obj = open(output_file_path, 'w')def 
store(self, entry, split = True, store_frequencies = True):if entry.unique and s
tore_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.i
ds)))else:self.write
>temp_481|source:-*- coding: utf-8 -*-|start:2164|stop:2664
f.unique:self.init_unique_hash()def init_unique_hash(self):while self.next_regul
ar():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_has
h_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[h
ash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [sel
f.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([
(self.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict],
 reverse = True)]sel
>temp_482|source:-*- coding: utf-8 -*-|start:3227|stop:3727
er.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.readline(
)if not line:if len(sequence):self.seq = sequenceself.pos += 1return Trueelse:re
turn Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell(
) - len(line))breaksequence += line.strip()self.seq = sequence if self.allow_mix
ed_case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self
, read_id):self.reset()while self.next():if self.id == read_id:return self.seqre
turn Falsedef close(
>temp_483|source:-*- coding: utf-8 -*-|start:1285|stop:1785
lf.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] R
eading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.appe
nd(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def 
close(self):self.fasta.close()class SequenceSource():def __init__(self, fasta_fi
le_path, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta_
file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_
mixed_case = allow_m
>temp_484|source:-*- coding: utf-8 -*-|start:1995|stop:2495
f.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq
 = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.rese
t()if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.n
ext_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.u
nique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_ha
sh_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'i
ds': [self.id],'seq'
>temp_485|source:-*- coding: utf-8 -*-|start:3554|stop:4054
 else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_
id):self.reset()while self.next():if self.id == read_id:return self.seqreturn Fa
lsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id 
 = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequen
ce_length_distribution(self, title, dest = None, max_seq_len = None, xtickstep =
 None, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gridsp
ec as gridspecsequen
>temp_486|source:-*- coding: utf-8 -*-|start:876|stop:1376
tput_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ti
cks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([s
equence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self
):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.id
s = []self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.nex
t():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fa
stalib] Reading FAST
>temp_487|source:-*- coding: utf-8 -*-|start:2685|stop:3185
self.unique_hash_dict)self.reset()def next(self):if self.unique:return self.next
_unique()else:return self.next_regular()def next_unique(self):if self.unique:if 
self.total_unique > 0 and self.pos < self.total_unique:hash_entry = self.unique_
hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['se
q'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry
['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsed
ef next_regular(self
>temp_488|source:-*- coding: utf-8 -*-|start:1738|stop:2238
init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = N
oneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}sel
f.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fa
sta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneel
se:self.total_seq = len([l for l in self.file_pointer.readlines() if l.startswit
h('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(s
elf):while self.next
>temp_489|source:-*- coding: utf-8 -*-|start:1279|stop:1779
:if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fasta
lib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.id
s.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n
')def close(self):self.fasta.close()class SequenceSource():def __init__(self, fa
sta_file_path, lazy_init = True, unique = False, allow_mixed_case = False):self.
fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.
allow_mixed_case = a
>temp_490|source:-*- coding: utf-8 -*-|start:3888|stop:4388
ribution(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickst
ep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspec
sequence_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or s
elf.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr
.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')
if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengt
hs) / 100.0) or 10)s
>temp_491|source:-*- coding: utf-8 -*-|start:2761|stop:3261
next_unique()else:return self.next_regular()def next_unique(self):if self.unique
:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = self.uni
que_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry
['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_e
ntry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Fa
lsedef next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[
1:].strip()sequence 
>temp_492|source:-*- coding: utf-8 -*-|start:4854|stop:5354
x_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.
ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep == None:x
tickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_d
istribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), r
otation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, y
tickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='
xx-small')plt.xlim(x
>temp_493|source:-*- coding: utf-8 -*-|start:1094|stop:1594
) - 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__
(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_nam
e)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:s
ys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))
sys.stderr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fasta
.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSourc
e():def __init__(sel
>temp_494|source:-*- coding: utf-8 -*-|start:2506|stop:3006
'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_dict
[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]self.
total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.uniq
ue:return self.next_unique()else:return self.next_regular()def next_unique(self)
:if self.unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_e
ntry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.s
eq = hash_entry['seq
>temp_495|source:-*- coding: utf-8 -*-|start:6327|stop:6827
file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse
:self.total_quals = len([l for l in self.file_pointer.readlines() if l.startswit
h('>')])self.reset()def next(self):self.id = self.file_pointer.readline()[1:].st
rip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.fi
le_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.stri
p()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return True
else:return Falseif 
>temp_496|source:-*- coding: utf-8 -*-|start:1059|stop:1559
x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.
close()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = 
[]self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos %
 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA i
nto memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.
id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):se
lf.fasta.close()clas
>temp_497|source:-*- coding: utf-8 -*-|start:3090|stop:3590
]self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef ne
xt_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip
()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(seq
uence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.start
swith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequ
ence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence.u
pper()self.pos += 1r
>temp_498|source:-*- coding: utf-8 -*-|start:3243|stop:3743
].strip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if 
len(sequence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif lin
e.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))br
eaksequence += line.strip()self.seq = sequence if self.allow_mixed_case else seq
uence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.
reset()while self.next():if self.id == read_id:return self.seqreturn Falsedef cl
ose(self):self.file_
>temp_499|source:-*- coding: utf-8 -*-|start:6269|stop:6769
nt = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_po
inter.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = le
n([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()de
f next(self):self.id = self.file_pointer.readline()[1:].strip()self.quals = None
self.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()i
f not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [i
nt(q) for q in self.
>temp_500|source:-*- coding: utf-8 -*-|start:1983|stop:2483
ile_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:se
lf.total_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>'
)])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(self):
while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if ha
sh in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)se
lf.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' 
: self.id,'ids': [se
>temp_501|source:-*- coding: utf-8 -*-|start:344|stop:844
numpyimport hashlibclass FastaOutput:def __init__(self, output_file_path):self.o
utput_file_path = output_file_pathself.output_file_obj = open(output_file_path, 
'w')def store(self, entry, split = True, store_frequencies = True):if entry.uniq
ue and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len
(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def wri
te_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq,
 split = True):if sp
>temp_502|source:-*- coding: utf-8 -*-|start:3099|stop:3599
 = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regula
r(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequenc
e = ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):se
lf.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'
):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += l
ine.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()sel
f.pos += 1return Tru
>temp_503|source:-*- coding: utf-8 -*-|start:5376|stop:5876
q_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distrib
ution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = '
xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edge
color' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'tot
al: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),nu
mpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\
max(sequence_lengths
>temp_504|source:-*- coding: utf-8 -*-|start:6256|stop:6756
eself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_path
)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.tot
al_quals = len([l for l in self.file_pointer.readlines() if l.startswith('>')])s
elf.reset()def next(self):self.id = self.file_pointer.readline()[1:].strip()self
.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointe
r.readline()if not line:if len(qualscores):self.quals = qualscores.strip()self.q
uals_int = [int(q) f
>temp_505|source:-*- coding: utf-8 -*-|start:4619|stop:5119
th=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.
subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_
len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq
_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabe
l('number of sequences')plt.xlabel('sequence length')if xtickstep == None:xticks
tep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distri
bution) / 20 or 1plt
>temp_506|source:-*- coding: utf-8 -*-|start:4178|stop:4678
g: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self
.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_leng
ths) + (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (m
ax_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.f
igure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('gr
id', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1
 = plt.subplot(gs[0:
>temp_507|source:-*- coding: utf-8 -*-|start:857|stop:1357
f.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, pi
ece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]
return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) -
 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __init__(se
lf, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)w
hile self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.
stderr.write('\r[fas
>temp_508|source:-*- coding: utf-8 -*-|start:4527|stop:5027
rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle=
'-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.gr
id(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)p
lt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(rang
e(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.
15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep ==
 None:xtickstep = (m
>temp_509|source:-*- coding: utf-8 -*-|start:112|stop:612
dify it under# the terms of the GNU General Public License as published by the F
ree# Software Foundation; either version 2 of the License, or (at your option)# 
any later version.## Please read the docs/COPYING file.import sysimport numpyimp
ort hashlibclass FastaOutput:def __init__(self, output_file_path):self.output_fi
le_path = output_file_pathself.output_file_obj = open(output_file_path, 'w')def 
store(self, entry, split = True, store_frequencies = True):if entry.unique and s
tore_frequencies:sel
>temp_510|source:-*- coding: utf-8 -*-|start:5707|stop:6207
an: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(se
quence_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequenc
e_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest =
 self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.p
ng')try:plt.show()except:passreturnclass QualSource:def __init__(self, quals_fil
e_path, lazy_init = True):self.quals_file_path = quals_file_pathself.name = None
self.lazy_init = laz
>temp_511|source:-*- coding: utf-8 -*-|start:1655|stop:2155
ixed_case = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.la
zy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  
= Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}
self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self
.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Non
eelse:self.total_seq = len([l for l in self.file_pointer.readlines() if l.starts
with('>')])self.rese
>temp_512|source:-*- coding: utf-8 -*-|start:4288|stop:4788
if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengt
hs) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in sequen
ce_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12))plt.r
cParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', linestyle='
-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.gri
d(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)pl
t.plot(seq_len_distr
>temp_513|source:-*- coding: utf-8 -*-|start:2494|stop:2994
': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.uniq
ue_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse =
 True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):
if self.unique:return self.next_unique()else:return self.next_regular()def next_
unique(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_u
nique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.po
s += 1self.seq = has
>temp_514|source:-*- coding: utf-8 -*-|start:912|stop:1412
 split(self, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_
length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x
 in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class 
ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self.fasta 
= SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or
 self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %
s' % (self.fasta.pos
>temp_515|source:-*- coding: utf-8 -*-|start:3851|stop:4351
(0)def visualize_sequence_length_distribution(self, title, dest = None, max_seq_
len = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as plti
mport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while self
.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib]
 Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.se
q))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequen
ce_lengths) + (int(m
>temp_516|source:-*- coding: utf-8 -*-|start:6254|stop:6754
oneself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_pa
th)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.t
otal_quals = len([l for l in self.file_pointer.readlines() if l.startswith('>')]
)self.reset()def next(self):self.id = self.file_pointer.readline()[1:].strip()se
lf.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_poin
ter.readline()if not line:if len(qualscores):self.quals = qualscores.strip()self
.quals_int = [int(q)
>temp_517|source:-*- coding: utf-8 -*-|start:3347|stop:3847
 sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.f
ile_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += line.stri
p()self.seq = sequence if self.allow_mixed_case else sequence.upper()self.pos +=
 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next()
:if self.id == read_id:return self.seqreturn Falsedef close(self):self.file_poin
ter.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = 
[]self.file_pointer.
>temp_518|source:-*- coding: utf-8 -*-|start:4413|stop:4913
 * (max_seq_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = 
plt.figure(figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.r
c('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 
1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom 
= 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black',
 alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y
2 = 0, color = 'blac
>temp_519|source:-*- coding: utf-8 -*-|start:490|stop:990
ut_file_path, 'w')def store(self, entry, split = True, store_frequencies = True)
:if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequ
ency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq,
 split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_
seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_obj.
write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(0,
 len(sequence), piec
>temp_520|source:-*- coding: utf-8 -*-|start:3389|stop:3889
rn Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() 
- len(line))breaksequence += line.strip()self.seq = sequence if self.allow_mixed
_case else sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, 
read_id):self.reset()while self.next():if self.id == read_id:return self.seqretu
rn Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0sel
f.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_s
equence_length_distr
>temp_521|source:-*- coding: utf-8 -*-|start:4666|stop:5166
ubplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top =
 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)
plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color 
= 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence le
ngth')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == No
ne:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, ma
x_seq_len + 1, xtick
>temp_522|source:-*- coding: utf-8 -*-|start:2372|stop:2872
(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[has
h] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_has
h_list = [i[1] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for
 hash in self.unique_hash_dict], reverse = True)]self.total_unique = len(self.un
ique_hash_dict)self.reset()def next(self):if self.unique:return self.next_unique
()else:return self.next_regular()def next_unique(self):if self.unique:if self.to
tal_unique > 0 and s
>temp_523|source:-*- coding: utf-8 -*-|start:5494|stop:5994
'%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.sub
plot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks
([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min:
 %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(
sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center',
 alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:pl
t.savefig(dest + '.p
>temp_524|source:-*- coding: utf-8 -*-|start:6818|stop:7318
 Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - 
len(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip()se
lf.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Truedef c
lose(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Nones
elf.quals = Noneself.quals_int = Noneself.ids = []self.file_pointer.seek(0)if __
name__ == '__main__':fasta = SequenceSource(sys.argv[1])fasta.visualize_sequence
_length_distribution
>temp_525|source:-*- coding: utf-8 -*-|start:5162|stop:5662
tickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribu
tion) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, yticks
tep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, y
max = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext
(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')a
x1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(Fals
e)plt.yticks([])plt.
>temp_526|source:-*- coding: utf-8 -*-|start:4688|stop:5188
d(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)pl
t.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range
(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.1
5)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep == 
None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq
_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickst
ep), rotation=90, si
>temp_527|source:-*- coding: utf-8 -*-|start:5197|stop:5697
all')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in r
ange(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin
 = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (
max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight 
= 'black', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams
.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.t
ext(0.02, 0.5, 'tota
>temp_528|source:-*- coding: utf-8 -*-|start:1515|stop:2015
('\n')def close(self):self.fasta.close()class SequenceSource():def __init__(self
, fasta_file_path, lazy_init = True, unique = False, allow_mixed_case = False):s
elf.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init = lazy_inits
elf.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Non
eself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_lis
t = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)sel
f.file_pointer.seek(
>temp_529|source:-*- coding: utf-8 -*-|start:49|stop:549
his program is free software; you can redistribute it and/or modify it under# th
e terms of the GNU General Public License as published by the Free# Software Fou
ndation; either version 2 of the License, or (at your option)# any later version
.## Please read the docs/COPYING file.import sysimport numpyimport hashlibclass 
FastaOutput:def __init__(self, output_file_path):self.output_file_path = output_
file_pathself.output_file_obj = open(output_file_path, 'w')def store(self, entry
, split = True, stor
>temp_530|source:-*- coding: utf-8 -*-|start:1796|stop:2296
lf.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.un
ique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_
pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:s
elf.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.read
lines() if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()
def init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(self.se
q.upper()).hexdigest
>temp_531|source:-*- coding: utf-8 -*-|start:5495|stop:5995
%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = plt.subp
lot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)plt.yticks(
[])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: 
%s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.std(s
equence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center', 
alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:plt
.savefig(dest + '.pd
>temp_532|source:-*- coding: utf-8 -*-|start:6482|stop:6982
tswith('>')])self.reset()def next(self):self.id = self.file_pointer.readline()[1
:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = se
lf.file_pointer.readline()if not line:if len(qualscores):self.quals = qualscores
.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return
 Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_p
ointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qual
scores.strip()self.q
>temp_533|source:-*- coding: utf-8 -*-|start:1780|stop:2280
low_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique
 = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_ha
sh = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if 
self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.f
ile_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self.in
it_unique_hash()def init_unique_hash(self):while self.next_regular():hash = hash
lib.sha1(self.seq.up
>temp_534|source:-*- coding: utf-8 -*-|start:141|stop:641
he GNU General Public License as published by the Free# Software Foundation; eit
her version 2 of the License, or (at your option)# any later version.## Please r
ead the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:
def __init__(self, output_file_path):self.output_file_path = output_file_pathsel
f.output_file_obj = open(output_file_path, 'w')def store(self, entry, split = Tr
ue, store_frequencies = True):if entry.unique and store_frequencies:self.write_i
d('%s|%s' % (entry.i
>temp_535|source:-*- coding: utf-8 -*-|start:4979|stop:5479
ence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickste
p == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xticks
tep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(
0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_di
stribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_l
en)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distributi
on) / 20.0))plt.figt
>temp_536|source:-*- coding: utf-8 -*-|start:2840|stop:3340
e:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = self.un
ique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_entr
y['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_
entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return F
alsedef next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()
[1:].strip()sequence = ''while 1:line = self.file_pointer.readline()if not line:
if len(sequence):sel
>temp_537|source:-*- coding: utf-8 -*-|start:4634|stop:5134
dspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust
(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distributio
n, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_
len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of se
quences')plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_
len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or
 1plt.xticks(range(x
>temp_538|source:-*- coding: utf-8 -*-|start:6575|stop:7075
f.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_point
er.readline()if not line:if len(qualscores):self.quals = qualscores.strip()self.
quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:ret
urn Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.tell()
 - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.strip(
)self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Truede
f close(self):self.f
>temp_539|source:-*- coding: utf-8 -*-|start:3098|stop:3598
s = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regul
ar(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequen
ce = ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):s
elf.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>
'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += 
line.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()se
lf.pos += 1return Tr
>temp_540|source:-*- coding: utf-8 -*-|start:2756|stop:3256
self.next_unique()else:return self.next_regular()def next_unique(self):if self.u
nique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = sel
f.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash_
entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = h
ash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:retu
rn Falsedef next_regular(self):self.seq = Noneself.id = self.file_pointer.readli
ne()[1:].strip()sequ
>temp_541|source:-*- coding: utf-8 -*-|start:4282|stop:4782
('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence
_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l in 
sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 12)
)plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', lines
tyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])p
lt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0
.98)plt.plot(seq_len
>temp_542|source:-*- coding: utf-8 -*-|start:144|stop:644
GNU General Public License as published by the Free# Software Foundation; either
 version 2 of the License, or (at your option)# any later version.## Please read
 the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:def
 __init__(self, output_file_path):self.output_file_path = output_file_pathself.o
utput_file_obj = open(output_file_path, 'w')def store(self, entry, split = True,
 store_frequencies = True):if entry.unique and store_frequencies:self.write_id('
%s|%s' % (entry.id, 
>temp_543|source:-*- coding: utf-8 -*-|start:6317|stop:6817
elf.quals_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals 
= Noneelse:self.total_quals = len([l for l in self.file_pointer.readlines() if l
.startswith('>')])self.reset()def next(self):self.id = self.file_pointer.readlin
e()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line
 = self.file_pointer.readline()if not line:if len(qualscores):self.quals = quals
cores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1r
eturn Trueelse:retur
>temp_544|source:-*- coding: utf-8 -*-|start:6344|stop:6844
le_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals
 = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.rese
t()def next(self):self.id = self.file_pointer.readline()[1:].strip()self.quals =
 Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readli
ne()if not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int
 = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return False
if line.startswith('
>temp_545|source:-*- coding: utf-8 -*-|start:50|stop:550
is program is free software; you can redistribute it and/or modify it under# the
 terms of the GNU General Public License as published by the Free# Software Foun
dation; either version 2 of the License, or (at your option)# any later version.
## Please read the docs/COPYING file.import sysimport numpyimport hashlibclass F
astaOutput:def __init__(self, output_file_path):self.output_file_path = output_f
ile_pathself.output_file_obj = open(output_file_path, 'w')def store(self, entry,
 split = True, store
>temp_546|source:-*- coding: utf-8 -*-|start:425|stop:925
tput_file_path = output_file_pathself.output_file_obj = open(output_file_path, '
w')def store(self, entry, split = True, store_frequencies = True):if entry.uniqu
e and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(
entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def writ
e_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, 
split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % 
seq)def split(self, 
>temp_547|source:-*- coding: utf-8 -*-|start:808|stop:1308
e_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_ob
j.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(
0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequence[tick
s[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.outpu
t_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.s
equences = []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.
fasta.pos % 1000 == 
>temp_548|source:-*- coding: utf-8 -*-|start:6456|stop:6956
nter.readlines() if l.startswith('>')])self.reset()def next(self):self.id = self
.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequalsc
ores = ''while 1:line = self.file_pointer.readline()if not line:if len(qualscore
s):self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.s
plit()]self.pos += 1return Trueelse:return Falseif line.startswith('>'):self.fil
e_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + lin
e.strip()self.quals 
>temp_549|source:-*- coding: utf-8 -*-|start:5400|stop:5900
 ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figte
xt(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center'
)ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(Fa
lse)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / st
d: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_length
s), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\
va = 'center', alpha
>temp_550|source:-*- coding: utf-8 -*-|start:3061|stop:3561
r()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return
 Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self.file
_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_pointer.rea
dline()if not line:if len(sequence):self.seq = sequenceself.pos += 1return Truee
lse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer
.tell() - len(line))breaksequence += line.strip()self.seq = sequence if self.all
ow_mixed_case else s
>temp_551|source:-*- coding: utf-8 -*-|start:47|stop:547
 This program is free software; you can redistribute it and/or modify it under# 
the terms of the GNU General Public License as published by the Free# Software F
oundation; either version 2 of the License, or (at your option)# any later versi
on.## Please read the docs/COPYING file.import sysimport numpyimport hashlibclas
s FastaOutput:def __init__(self, output_file_path):self.output_file_path = outpu
t_file_pathself.output_file_obj = open(output_file_path, 'w')def store(self, ent
ry, split = True, st
>temp_552|source:-*- coding: utf-8 -*-|start:4564|stop:5064
.9})plt.rc('grid', color='0.50', linestyle='-', linewidth=0.1)gs = gridspec.Grid
Spec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05
, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color =
 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distri
bution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')p
lt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) 
or 1if ytickstep == 
>temp_553|source:-*- coding: utf-8 -*-|start:6114|stop:6614
 lazy_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.la
zy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = 
Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_pointer
.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len([l 
for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()def nex
t(self):self.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.
quals_int = Nonequal
>temp_554|source:-*- coding: utf-8 -*-|start:255|stop:755
at your option)# any later version.## Please read the docs/COPYING file.import s
ysimport numpyimport hashlibclass FastaOutput:def __init__(self, output_file_pat
h):self.output_file_path = output_file_pathself.output_file_obj = open(output_fi
le_path, 'w')def store(self, entry, split = True, store_frequencies = True):if e
ntry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:
%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, spli
t)def write_id(self,
>temp_555|source:-*- coding: utf-8 -*-|start:1301|stop:1801
000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA int
o memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.id
)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):self
.fasta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy_in
it = True, unique = False, allow_mixed_case = False):self.fasta_file_path = fast
a_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = all
ow_mixed_caseself.po
>temp_556|source:-*- coding: utf-8 -*-|start:1371|stop:1871
 FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(sel
f.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(
self):self.fasta.close()class SequenceSource():def __init__(self, fasta_file_pat
h, lazy_init = True, unique = False, allow_mixed_case = False):self.fasta_file_p
ath = fasta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_
case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []se
lf.unique = uniquese
>temp_557|source:-*- coding: utf-8 -*-|start:6566|stop:7066
trip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.f
ile_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.str
ip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Tru
eelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_point
er.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscor
es.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retu
rn Truedef close(sel
>temp_558|source:-*- coding: utf-8 -*-|start:3841|stop:4341
inter.seek(0)def visualize_sequence_length_distribution(self, title, dest = None
, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotlib.pypl
ot as pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()
while self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r
[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(l
en(self.seq))self.reset()sys.stderr.write('\n')if not max_seq_len:max_seq_len = 
max(sequence_lengths
>temp_559|source:-*- coding: utf-8 -*-|start:3768|stop:4268
self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.s
eek(0)def visualize_sequence_length_distribution(self, title, dest = None, max_s
eq_len = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as p
ltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while s
elf.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastal
ib] Reading: %s' % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self
.seq))self.reset()sy
>temp_560|source:-*- coding: utf-8 -*-|start:5217|stop:5717
e(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_
distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq
_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distribu
tion) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'x
x-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgec
olor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'tota
l: %s / mean: %.2f /
>temp_561|source:-*- coding: utf-8 -*-|start:1126|stop:1626
put_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = []self
.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next():if sel
f.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Re
ading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.appen
d(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def c
lose(self):self.fasta.close()class SequenceSource():def __init__(self, fasta_fil
e_path, lazy_init = 
>temp_562|source:-*- coding: utf-8 -*-|start:6651|stop:7151
ointer.readline()if not line:if len(qualscores):self.quals = qualscores.strip()s
elf.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse
:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.te
ll() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.st
rip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Tr
uedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  
= Noneself.quals = N
>temp_563|source:-*- coding: utf-8 -*-|start:4267|stop:4767
ys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (i
nt(max(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_le
n + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(fig
size = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', colo
r='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.su
bplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 
0.95, right = 0.98)p
>temp_564|source:-*- coding: utf-8 -*-|start:4588|stop:5088
='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.sub
plot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0
.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)pl
t.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 
'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence leng
th')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None
:ytickstep = max(seq
>temp_565|source:-*- coding: utf-8 -*-|start:5456|stop:5956
ution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = '
xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edge
color' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'tot
al: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),nu
mpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_lengths),\
max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest == 
None:dest = self.fas
>temp_566|source:-*- coding: utf-8 -*-|start:1722|stop:2222
= Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos
 = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_h
ash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointe
r = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.to
tal_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines(
) if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def in
it_unique_hash(self)
>temp_567|source:-*- coding: utf-8 -*-|start:2802|stop:3302
r()def next_unique(self):if self.unique:if self.total_unique > 0 and self.pos < 
self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.
pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else has
h_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']ret
urn Trueelse:return Falseelse:return Falsedef next_regular(self):self.seq = None
self.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:line = se
lf.file_pointer.read
>temp_568|source:-*- coding: utf-8 -*-|start:5282|stop:5782
0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0,
 xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(s
eq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'bl
ack', size = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.upda
te({'axes.edgecolor' : 20})plt.grid(False)plt.yticks([])plt.xticks([])plt.text(0
.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequen
ce_lengths),numpy.me
>temp_569|source:-*- coding: utf-8 -*-|start:2716|stop:3216
t()def next(self):if self.unique:return self.next_unique()else:return self.next_
regular()def next_unique(self):if self.unique:if self.total_unique > 0 and self.
pos < self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list
[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case el
se hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['id
s']return Trueelse:return Falseelse:return Falsedef next_regular(self):self.seq 
= Noneself.id = self
>temp_570|source:-*- coding: utf-8 -*-|start:6213|stop:6713
self.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = []se
lf.file_pointer = open(self.quals_file_path)self.file_pointer.seek(0)if self.laz
y_init:self.total_quals = Noneelse:self.total_quals = len([l for l in self.file_
pointer.readlines() if l.startswith('>')])self.reset()def next(self):self.id = s
elf.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequa
lscores = ''while 1:line = self.file_pointer.readline()if not line:if len(qualsc
ores):self.quals = q
>temp_571|source:-*- coding: utf-8 -*-|start:5855|stop:6355
\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest ==
 None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefi
g(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init__(sel
f, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_pathself
.name = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = No
neself.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_pat
h)self.file_pointer.
>temp_572|source:-*- coding: utf-8 -*-|start:186|stop:686
 the Free# Software Foundation; either version 2 of the License, or (at your opt
ion)# any later version.## Please read the docs/COPYING file.import sysimport nu
mpyimport hashlibclass FastaOutput:def __init__(self, output_file_path):self.out
put_file_path = output_file_pathself.output_file_obj = open(output_file_path, 'w
')def store(self, entry, split = True, store_frequencies = True):if entry.unique
 and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(e
ntry.ids)))else:self
>temp_573|source:-*- coding: utf-8 -*-|start:5310|stop:5810
 + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim
(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0)
)plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha 
= 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})p
lt.grid(False)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean:
 %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(seque
nce_lengths), numpy.
>temp_574|source:-*- coding: utf-8 -*-|start:299|stop:799
 read the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutpu
t:def __init__(self, output_file_path):self.output_file_path = output_file_paths
elf.output_file_obj = open(output_file_path, 'w')def store(self, entry, split = 
True, store_frequencies = True):if entry.unique and store_frequencies:self.write
_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(ent
ry.id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_file_ob
j.write('>%s\n' % id
>temp_575|source:-*- coding: utf-8 -*-|start:2264|stop:2764
sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_h
ash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1el
se:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.se
q,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_hash_di
ct[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True)]sel
f.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if self.un
ique:return self.nex
>temp_576|source:-*- coding: utf-8 -*-|start:1771|stop:2271
case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []se
lf.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list = []self.uniqu
e_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.file_pointer.s
eek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l 
in self.file_pointer.readlines() if l.startswith('>')])self.reset()if self.uniqu
e:self.init_unique_hash()def init_unique_hash(self):while self.next_regular():ha
sh = hashlib.sha1(se
>temp_577|source:-*- coding: utf-8 -*-|start:4641|stop:5141
ridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0
.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, colo
r = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_dis
tribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences
')plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 5
0) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.x
ticks(range(xtickste
>temp_578|source:-*- coding: utf-8 -*-|start:4645|stop:5145
pec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05,
 bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 
'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distrib
ution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')pl
t.xlabel('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) o
r 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xtick
s(range(xtickstep, m
>temp_579|source:-*- coding: utf-8 -*-|start:6264|stop:6764
als_int = Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.fi
le_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals
 = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.rese
t()def next(self):self.id = self.file_pointer.readline()[1:].strip()self.quals =
 Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readli
ne()if not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int
 = [int(q) for q in 
>temp_580|source:-*- coding: utf-8 -*-|start:808|stop:1308
e_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file_ob
j.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = range(
0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequence[tick
s[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.outpu
t_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.s
equences = []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.
fasta.pos % 1000 == 
>temp_581|source:-*- coding: utf-8 -*-|start:6684|stop:7184
en(qualscores):self.quals = qualscores.strip()self.quals_int = [int(q) for q in 
self.quals.split()]self.pos += 1return Trueelse:return Falseif line.startswith('
>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores 
+= ' ' + line.strip()self.quals = qualscores.strip()self.quals_int = [int(q) for
 q in self.quals.split()]self.pos += 1return Truedef close(self):self.file_point
er.close()def reset(self):self.pos = 0self.id  = Noneself.quals = Noneself.quals
_int = Noneself.ids 
>temp_582|source:-*- coding: utf-8 -*-|start:6100|stop:6600
als_file_path, lazy_init = True):self.quals_file_path = quals_file_pathself.name
 = Noneself.lazy_init = lazy_initself.pos = 0self.id  = Noneself.quals = Nonesel
f.quals_int = Noneself.ids = []self.file_pointer = open(self.quals_file_path)sel
f.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_q
uals = len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.
reset()def next(self):self.id = self.file_pointer.readline()[1:].strip()self.qua
ls = Noneself.quals_
>temp_583|source:-*- coding: utf-8 -*-|start:241|stop:741
 License, or (at your option)# any later version.## Please read the docs/COPYING
 file.import sysimport numpyimport hashlibclass FastaOutput:def __init__(self, o
utput_file_path):self.output_file_path = output_file_pathself.output_file_obj = 
open(output_file_path, 'w')def store(self, entry, split = True, store_frequencie
s = True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.i
d, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(e
ntry.seq, split)def 
>temp_584|source:-*- coding: utf-8 -*-|start:3570|stop:4070
pper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()
while self.next():if self.id == read_id:return self.seqreturn Falsedef close(sel
f):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq 
= Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_distri
bution(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickstep
 = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecse
quence_lengths = []s
>temp_585|source:-*- coding: utf-8 -*-|start:5129|stop:5629
nge(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yti
cks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(
seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax =
 max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_
distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', s
ize = 'xx-large', ha = 'center')ax1 = plt.subplot(gs[9])plt.rcParams.update({'ax
es.edgecolor' : 20})
>temp_586|source:-*- coding: utf-8 -*-|start:4853|stop:5353
ax_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt
.ylabel('number of sequences')plt.xlabel('sequence length')if xtickstep == None:
xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_
distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), 
rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, 
ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],size=
'xx-small')plt.xlim(
>temp_587|source:-*- coding: utf-8 -*-|start:1494|stop:1994
.seq)sys.stderr.write('\n')def close(self):self.fasta.close()class SequenceSourc
e():def __init__(self, fasta_file_path, lazy_init = True, unique = False, allow_
mixed_case = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.l
azy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id 
 = Noneself.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {
}self.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(sel
f.fasta_file_path)se
>temp_588|source:-*- coding: utf-8 -*-|start:1299|stop:1799
 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA i
nto memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.
id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):se
lf.fasta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy_
init = True, unique = False, allow_mixed_case = False):self.fasta_file_path = fa
sta_file_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = a
llow_mixed_caseself.
>temp_589|source:-*- coding: utf-8 -*-|start:6537|stop:7037
file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = Nonequalsco
res = ''while 1:line = self.file_pointer.readline()if not line:if len(qualscores
):self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.sp
lit()]self.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file
_pointer.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line
.strip()self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.qu
als.split()]self.pos
>temp_590|source:-*- coding: utf-8 -*-|start:1976|stop:2476
fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = None
else:self.total_seq = len([l for l in self.file_pointer.readlines() if l.startsw
ith('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash
(self):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest
()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(sel
f.id)self.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] =
 {'id' : self.id,'id
>temp_591|source:-*- coding: utf-8 -*-|start:5563|stop:6063
1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False
)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: 
%.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths),
 numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va 
= 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file
_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show
()except:passreturnc
>temp_592|source:-*- coding: utf-8 -*-|start:4200|stop:4700
s.stderr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.wri
te('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequen
ce_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq_len + 1)for l i
n sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize = (16, 1
2))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', lin
estyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8]
)plt.grid(True)plt.s
>temp_593|source:-*- coding: utf-8 -*-|start:2225|stop:2725
ile self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash
 in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self
.unique_hash_dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : 
self.id,'ids': [self.id],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1
] for i in sorted([(self.unique_hash_dict[hash]['count'], hash)\for hash in self
.unique_hash_dict], reverse = True)]self.total_unique = len(self.unique_hash_dic
t)self.reset()def ne
>temp_594|source:-*- coding: utf-8 -*-|start:868|stop:1368
)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_length 
= 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return '\n'
.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])def cl
ose(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_name)
:self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while self.f
asta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.writ
e('\r[fastalib] Read
>temp_595|source:-*- coding: utf-8 -*-|start:3193|stop:3693
eq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:
line = self.file_pointer.readline()if not line:if len(sequence):self.seq = seque
nceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_po
inter.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip()sel
f.seq = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1retu
rn Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if se
lf.id == read_id:ret
>temp_596|source:-*- coding: utf-8 -*-|start:4632|stop:5132
ridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)plt.subplots_adju
st(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(seq_len_distribut
ion, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), se
q_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of 
sequences')plt.xlabel('sequence length')if xtickstep == None:xtickstep = (max_se
q_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 
or 1plt.xticks(range
>temp_597|source:-*- coding: utf-8 -*-|start:3515|stop:4015
seq = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1return
 Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if self
.id == read_id:return self.seqreturn Falsedef close(self):self.file_pointer.clos
e()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.f
ile_pointer.seek(0)def visualize_sequence_length_distribution(self, title, dest 
= None, max_seq_len = None, xtickstep = None, ytickstep = None):import matplotli
b.pyplot as pltimpor
>temp_598|source:-*- coding: utf-8 -*-|start:2088|stop:2588
 l in self.file_pointer.readlines() if l.startswith('>')])self.reset()if self.un
ique:self.init_unique_hash()def init_unique_hash(self):while self.next_regular()
:hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_di
ct:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]
['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id
],'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(sel
f.unique_hash_dict[h
>temp_599|source:-*- coding: utf-8 -*-|start:6338|stop:6838
elf.file_pointer.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total
_quals = len([l for l in self.file_pointer.readlines() if l.startswith('>')])sel
f.reset()def next(self):self.id = self.file_pointer.readline()[1:].strip()self.q
uals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.file_pointer.
readline()if not line:if len(qualscores):self.quals = qualscores.strip()self.qua
ls_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueelse:return
 Falseif line.starts
>temp_600|source:-*- coding: utf-8 -*-|start:1918|stop:2418
 []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.f
ile_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = 
len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()
if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next
_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.uniq
ue_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_
dict[hash]['count'] 
>temp_601|source:-*- coding: utf-8 -*-|start:3358|stop:3858
lf.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer
.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq
 = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1return Tr
uedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.id
 == read_id:return self.seqreturn Falsedef close(self):self.file_pointer.close()
def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file
_pointer.seek(0)def 
>temp_602|source:-*- coding: utf-8 -*-|start:4431|stop:4931
1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(figsize
 = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0
.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplo
t(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95
, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.f
ill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'bl
ack', alpha = 0.15)p
>temp_603|source:-*- coding: utf-8 -*-|start:3534|stop:4034
elf.allow_mixed_case else sequence.upper()self.pos += 1return Truedef get_seq_by
_read_id(self, read_id):self.reset()while self.next():if self.id == read_id:retu
rn self.seqreturn Falsedef close(self):self.file_pointer.close()def reset(self):
self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)
def visualize_sequence_length_distribution(self, title, dest = None, max_seq_len
 = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as pltimpo
rt matplotlib.gridsp
>temp_604|source:-*- coding: utf-8 -*-|start:3888|stop:4388
ribution(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickst
ep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspec
sequence_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or s
elf.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr
.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')
if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengt
hs) / 100.0) or 10)s
>temp_605|source:-*- coding: utf-8 -*-|start:3714|stop:4214
lsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id 
 = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequen
ce_length_distribution(self, title, dest = None, max_seq_len = None, xtickstep =
 None, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gridsp
ec as gridspecsequence_lengths = []self.reset()while self.next():if self.pos % 1
0000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.p
os))sys.stderr.flush
>temp_606|source:-*- coding: utf-8 -*-|start:6274|stop:6774
Noneself.ids = []self.file_pointer = open(self.quals_file_path)self.file_pointer
.seek(0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len([l 
for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()def nex
t(self):self.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.
quals_int = Nonequalscores = ''while 1:line = self.file_pointer.readline()if not
 line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [int(q)
 for q in self.quals
>temp_607|source:-*- coding: utf-8 -*-|start:3880|stop:4380
gth_distribution(self, title, dest = None, max_seq_len = None, xtickstep = None,
 ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec as 
gridspecsequence_lengths = []self.reset()while self.next():if self.pos % 10000 =
= 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sy
s.stderr.flush()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.wri
te('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequen
ce_lengths) / 100.0)
>temp_608|source:-*- coding: utf-8 -*-|start:4617|stop:5117
idth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8])plt.grid(True)pl
t.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right = 0.98)plt.plot(se
q_len_distribution, color = 'black', alpha = 0.3)plt.fill_between(range(0, max_s
eq_len + 1), seq_len_distribution, y2 = 0, color = 'black', alpha = 0.15)plt.yla
bel('number of sequences')plt.xlabel('sequence length')if xtickstep == None:xtic
kstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_dist
ribution) / 20 or 1p
>temp_609|source:-*- coding: utf-8 -*-|start:564|stop:1064
 True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 
'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entr
y.seq, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def 
write_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_fil
e_obj.write('%s\n' % seq)def split(self, sequence, piece_length = 80):ticks = ra
nge(0, len(sequence), piece_length) + [len(sequence)]return '\n'.join([sequence[
ticks[x]:ticks[x + 1
>temp_610|source:-*- coding: utf-8 -*-|start:2675|stop:3175
que = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:return
 self.next_unique()else:return self.next_regular()def next_unique(self):if self.
unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry = se
lf.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = hash
_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = 
hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:ret
urn Falsedef next_re
>temp_611|source:-*- coding: utf-8 -*-|start:677|stop:1177
else:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(self, i
d):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split = True
):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def spli
t(self, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_lengt
h) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in r
ange(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadF
asta:def __init__(se
>temp_612|source:-*- coding: utf-8 -*-|start:6120|stop:6620
init = True):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_ini
t = lazy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Nonese
lf.ids = []self.file_pointer = open(self.quals_file_path)self.file_pointer.seek(
0)if self.lazy_init:self.total_quals = Noneelse:self.total_quals = len([l for l 
in self.file_pointer.readlines() if l.startswith('>')])self.reset()def next(self
):self.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_
int = Nonequalscores
>temp_613|source:-*- coding: utf-8 -*-|start:3009|stop:3509
if self.allow_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id
']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef n
ext_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].stri
p()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(se
quence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.star
tswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakseq
uence += line.strip(
>temp_614|source:-*- coding: utf-8 -*-|start:340|stop:840
ort numpyimport hashlibclass FastaOutput:def __init__(self, output_file_path):se
lf.output_file_path = output_file_pathself.output_file_obj = open(output_file_pa
th, 'w')def store(self, entry, split = True, store_frequencies = True):if entry.
unique and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' %
 len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def
 write_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, 
seq, split = True):i
>temp_615|source:-*- coding: utf-8 -*-|start:5694|stop:6194
otal: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),
numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequence_lengths)
,\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-large')if dest =
= None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savef
ig(dest + '.png')try:plt.show()except:passreturnclass QualSource:def __init__(se
lf, quals_file_path, lazy_init = True):self.quals_file_path = quals_file_pathsel
f.name = Noneself.la
>temp_616|source:-*- coding: utf-8 -*-|start:5648|stop:6148
yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f 
/ min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), nump
y.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'ce
nter', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_path
try:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()exc
ept:passreturnclass QualSource:def __init__(self, quals_file_path, lazy_init = T
rue):self.quals_file
>temp_617|source:-*- coding: utf-8 -*-|start:3710|stop:4210
n Falsedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self
.id  = Noneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_se
quence_length_distribution(self, title, dest = None, max_seq_len = None, xtickst
ep = None, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gr
idspec as gridspecsequence_lengths = []self.reset()while self.next():if self.pos
 % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (se
lf.pos))sys.stderr.f
>temp_618|source:-*- coding: utf-8 -*-|start:302|stop:802
ad the docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:d
ef __init__(self, output_file_path):self.output_file_path = output_file_pathself
.output_file_obj = open(output_file_path, 'w')def store(self, entry, split = Tru
e, store_frequencies = True):if entry.unique and store_frequencies:self.write_id
('%s|%s' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.
id)self.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.w
rite('>%s\n' % id)de
>temp_619|source:-*- coding: utf-8 -*-|start:1545|stop:2045
ta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy_init =
 True, unique = False, allow_mixed_case = False):self.fasta_file_path = fasta_fi
le_pathself.name = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_m
ixed_caseself.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = un
iqueself.unique_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 
0self.file_pointer = open(self.fasta_file_path)self.file_pointer.seek(0)if self.
lazy_init:self.total
>temp_620|source:-*- coding: utf-8 -*-|start:2023|stop:2523
f.lazy_init:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file
_pointer.readlines() if l.startswith('>')])self.reset()if self.unique:self.init_
unique_hash()def init_unique_hash(self):while self.next_regular():hash = hashlib
.sha1(self.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_
hash_dict[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1e
lse:self.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.s
eq,'count': 1}self.u
>temp_621|source:-*- coding: utf-8 -*-|start:3088|stop:3588
d']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Falsedef 
next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].str
ip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if len(s
equence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.sta
rtswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakse
quence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence
.upper()self.pos += 
>temp_622|source:-*- coding: utf-8 -*-|start:2672|stop:3172
unique = len(self.unique_hash_dict)self.reset()def next(self):if self.unique:ret
urn self.next_unique()else:return self.next_regular()def next_unique(self):if se
lf.unique:if self.total_unique > 0 and self.pos < self.total_unique:hash_entry =
 self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1self.seq = h
ash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upper()self.id 
 = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:
return Falsedef next
>temp_623|source:-*- coding: utf-8 -*-|start:2489|stop:2989
,'seq': self.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self
.unique_hash_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reve
rse = True)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(s
elf):if self.unique:return self.next_unique()else:return self.next_regular()def 
next_unique(self):if self.unique:if self.total_unique > 0 and self.pos < self.to
tal_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]se
lf.pos += 1self.seq 
>temp_624|source:-*- coding: utf-8 -*-|start:2740|stop:3240
f.unique:return self.next_unique()else:return self.next_regular()def next_unique
(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_unique:
hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 1
self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_entry['seq'].upp
er()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueelse:retur
n Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = self.fil
e_pointer.readline()
>temp_625|source:-*- coding: utf-8 -*-|start:850|stop:1350
q = self.split(seq)self.output_file_obj.write('%s\n' % seq)def split(self, seque
nce, piece_length = 80):ticks = range(0, len(sequence), piece_length) + [len(seq
uence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(t
icks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta:def __in
it__(self, f_name):self.ids = []self.sequences = []self.fasta = SequenceSource(f
_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos ==
 1:sys.stderr.write(
>temp_626|source:-*- coding: utf-8 -*-|start:3687|stop:4187
id:return self.seqreturn Falsedef close(self):self.file_pointer.close()def reset
(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_pointer.
seek(0)def visualize_sequence_length_distribution(self, title, dest = None, max_
seq_len = None, xtickstep = None, ytickstep = None):import matplotlib.pyplot as 
pltimport matplotlib.gridspec as gridspecsequence_lengths = []self.reset()while 
self.next():if self.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fasta
lib] Reading: %s' % 
>temp_627|source:-*- coding: utf-8 -*-|start:977|stop:1477
quence), piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:tick
s[x + 1]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_ob
j.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences 
= []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos
 % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA
 into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fast
a.id)self.sequences.
>temp_628|source:-*- coding: utf-8 -*-|start:2032|stop:2532
it:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.
readlines() if l.startswith('>')])self.reset()if self.unique:self.init_unique_ha
sh()def init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(sel
f.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict
[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.
unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count
': 1}self.unique_has
>temp_629|source:-*- coding: utf-8 -*-|start:5565|stop:6065
= plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False)p
lt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: %.
2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), n
umpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 
'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_p
athtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()
except:passreturncla
>temp_630|source:-*- coding: utf-8 -*-|start:1719|stop:2219
me = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.
pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.uniqu
e_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_poi
nter = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self
.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlin
es() if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def
 init_unique_hash(se
>temp_631|source:-*- coding: utf-8 -*-|start:3442|stop:3942
k(self.file_pointer.tell() - len(line))breaksequence += line.strip()self.seq = s
equence if self.allow_mixed_case else sequence.upper()self.pos += 1return Truede
f get_seq_by_read_id(self, read_id):self.reset()while self.next():if self.id == 
read_id:return self.seqreturn Falsedef close(self):self.file_pointer.close()def 
reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.file_poi
nter.seek(0)def visualize_sequence_length_distribution(self, title, dest = None,
 max_seq_len = None,
>temp_632|source:-*- coding: utf-8 -*-|start:3408|stop:3908
rtswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breakse
quence += line.strip()self.seq = sequence if self.allow_mixed_case else sequence
.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset
()while self.next():if self.id == read_id:return self.seqreturn Falsedef close(s
elf):self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.se
q = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_length_dist
ribution(self, title
>temp_633|source:-*- coding: utf-8 -*-|start:863|stop:1363
t(seq)self.output_file_obj.write('%s\n' % seq)def split(self, sequence, piece_le
ngth = 80):ticks = range(0, len(sequence), piece_length) + [len(sequence)]return
 '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range(0, len(ticks) - 1)])d
ef close(self):self.output_file_obj.close()class ReadFasta:def __init__(self, f_
name):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while s
elf.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr
.write('\r[fastalib]
>temp_634|source:-*- coding: utf-8 -*-|start:509|stop:1009
ef store(self, entry, split = True, store_frequencies = True):if entry.unique an
d store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entr
y.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id
(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, spli
t = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)
def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence), pie
ce_length) + [len(se
>temp_635|source:-*- coding: utf-8 -*-|start:272|stop:772
any later version.## Please read the docs/COPYING file.import sysimport numpyimp
ort hashlibclass FastaOutput:def __init__(self, output_file_path):self.output_fi
le_path = output_file_pathself.output_file_obj = open(output_file_path, 'w')def 
store(self, entry, split = True, store_frequencies = True):if entry.unique and s
tore_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.i
ds)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(se
lf, id):self.output_
>temp_636|source:-*- coding: utf-8 -*-|start:2952|stop:3452
list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_cas
e else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry
['ids']return Trueelse:return Falseelse:return Falsedef next_regular(self):self.
seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1
:line = self.file_pointer.readline()if not line:if len(sequence):self.seq = sequ
enceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_p
ointer.seek(self.fil
>temp_637|source:-*- coding: utf-8 -*-|start:3335|stop:3835
):self.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith
('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence 
+= line.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper(
)self.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while
 self.next():if self.id == read_id:return self.seqreturn Falsedef close(self):se
lf.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Non
eself.ids = []self.f
>temp_638|source:-*- coding: utf-8 -*-|start:1719|stop:2219
me = Noneself.lazy_init = lazy_initself.allow_mixed_case = allow_mixed_caseself.
pos = 0self.id  = Noneself.seq = Noneself.ids = []self.unique = uniqueself.uniqu
e_hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_poi
nter = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self
.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlin
es() if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def
 init_unique_hash(se
>temp_639|source:-*- coding: utf-8 -*-|start:1184|stop:1684
ame):self.ids = []self.sequences = []self.fasta = SequenceSource(f_name)while se
lf.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.
write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr
.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.s
tderr.write('\n')def close(self):self.fasta.close()class SequenceSource():def __
init__(self, fasta_file_path, lazy_init = True, unique = False, allow_mixed_case
 = False):self.fasta
>temp_640|source:-*- coding: utf-8 -*-|start:3180|stop:3680
(self):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence
 = ''while 1:line = self.file_pointer.readline()if not line:if len(sequence):sel
f.seq = sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>')
:self.file_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += li
ne.strip()self.seq = sequence if self.allow_mixed_case else sequence.upper()self
.pos += 1return Truedef get_seq_by_read_id(self, read_id):self.reset()while self
.next():if self.id =
>temp_641|source:-*- coding: utf-8 -*-|start:4899|stop:5399
 color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequ
ence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickste
p == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xticks
tep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(
0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_di
stribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_l
en)plt.ylim(ymin = 0
>temp_642|source:-*- coding: utf-8 -*-|start:2782|stop:3282
urn self.next_regular()def next_unique(self):if self.unique:if self.total_unique
 > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.un
ique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow
_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = 
hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(s
elf):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence =
 ''while 1:line = se
>temp_643|source:-*- coding: utf-8 -*-|start:6126|stop:6626
 True):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_init = la
zy_initself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids
 = []self.file_pointer = open(self.quals_file_path)self.file_pointer.seek(0)if s
elf.lazy_init:self.total_quals = Noneelse:self.total_quals = len([l for l in sel
f.file_pointer.readlines() if l.startswith('>')])self.reset()def next(self):self
.id = self.file_pointer.readline()[1:].strip()self.quals = Noneself.quals_int = 
Nonequalscores = ''w
>temp_644|source:-*- coding: utf-8 -*-|start:6647|stop:7147
le_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.stri
p()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return True
else:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointe
r.tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscore
s.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1retur
n Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.
id  = Noneself.quals
>temp_645|source:-*- coding: utf-8 -*-|start:5089|stop:5589
len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickste
p), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribution) +
 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickstep)],s
ize='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = m
ax(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0
.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax1 = pl
t.subplot(gs[9])plt.
>temp_646|source:-*- coding: utf-8 -*-|start:2499|stop:2999
lf.seq,'count': 1}self.unique_hash_list = [i[1] for i in sorted([(self.unique_ha
sh_dict[hash]['count'], hash)\for hash in self.unique_hash_dict], reverse = True
)]self.total_unique = len(self.unique_hash_dict)self.reset()def next(self):if se
lf.unique:return self.next_unique()else:return self.next_regular()def next_uniqu
e(self):if self.unique:if self.total_unique > 0 and self.pos < self.total_unique
:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]self.pos += 
1self.seq = hash_ent
>temp_647|source:-*- coding: utf-8 -*-|start:5029|stop:5529
_seq_len / 50) or 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 
20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, siz
e='xx-small')plt.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y fo
r y in range(0, max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.x
lim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribut
ion) + (max(seq_len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title),
 weight = 'black', s
>temp_648|source:-*- coding: utf-8 -*-|start:2727|stop:3227
(self):if self.unique:return self.next_unique()else:return self.next_regular()de
f next_unique(self):if self.unique:if self.total_unique > 0 and self.pos < self.
total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self.pos]]
self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else hash_ent
ry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return T
rueelse:return Falseelse:return Falsedef next_regular(self):self.seq = Noneself.
id = self.file_point
>temp_649|source:-*- coding: utf-8 -*-|start:4104|stop:4604
lf.pos % 10000 == 0 or self.pos == 1:sys.stderr.write('\r[fastalib] Reading: %s'
 % (self.pos))sys.stderr.flush()sequence_lengths.append(len(self.seq))self.reset
()sys.stderr.write('\n')if not max_seq_len:max_seq_len = max(sequence_lengths) +
 (int(max(sequence_lengths) / 100.0) or 10)seq_len_distribution = [0] * (max_seq
_len + 1)for l in sequence_lengths:seq_len_distribution[l] += 1fig = plt.figure(
figsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', c
olor='0.50', linesty
>temp_650|source:-*- coding: utf-8 -*-|start:5045|stop:5545
r 1if ytickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xtick
s(range(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt
.yticks(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, 
max(seq_len_distribution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xm
ax = max_seq_len)plt.ylim(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_
len_distribution) / 20.0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black
', size = 'xx-large'
>temp_651|source:-*- coding: utf-8 -*-|start:4506|stop:5006
gsize = (16, 12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', col
or='0.50', linestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.s
ubplot(gs[0:8])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top =
 0.95, right = 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)
plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color 
= 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence le
ngth')if xtickstep =
>temp_652|source:-*- coding: utf-8 -*-|start:5718|stop:6218
std: %.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_leng
ths), numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths))
,\va = 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta
_file_pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt
.show()except:passreturnclass QualSource:def __init__(self, quals_file_path, laz
y_init = True):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_i
nit = lazy_initself.
>temp_653|source:-*- coding: utf-8 -*-|start:2721|stop:3221
f next(self):if self.unique:return self.next_unique()else:return self.next_regul
ar()def next_unique(self):if self.unique:if self.total_unique > 0 and self.pos <
 self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_list[self
.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case else ha
sh_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']re
turn Trueelse:return Falseelse:return Falsedef next_regular(self):self.seq = Non
eself.id = self.file
>temp_654|source:-*- coding: utf-8 -*-|start:921|stop:1421
lf, sequence, piece_length = 80):ticks = range(0, len(sequence), piece_length) +
 [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]] for x in range
(0, len(ticks) - 1)])def close(self):self.output_file_obj.close()class ReadFasta
:def __init__(self, f_name):self.ids = []self.sequences = []self.fasta = Sequenc
eSource(f_name)while self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fas
ta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA into memory: %s' % (sel
f.fasta.pos))sys.std
>temp_655|source:-*- coding: utf-8 -*-|start:1998|stop:2498
ile_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = 
len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()
if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next
_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()if hash in self.uniq
ue_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.id)self.unique_hash_
dict[hash]['count'] += 1else:self.unique_hash_dict[hash] = {'id' : self.id,'ids'
: [self.id],'seq': s
>temp_656|source:-*- coding: utf-8 -*-|start:4812|stop:5312
, alpha = 0.3)plt.fill_between(range(0, max_seq_len + 1), seq_len_distribution, 
y2 = 0, color = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabe
l('sequence length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if y
tickstep == None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range
(xtickstep, max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks
(range(0, max(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq
_len_distribution) +
>temp_657|source:-*- coding: utf-8 -*-|start:4519|stop:5019
12))plt.rcParams.update({'axes.linewidth' : 0.9})plt.rc('grid', color='0.50', li
nestyle='-', linewidth=0.1)gs = gridspec.GridSpec(10, 1)ax1 = plt.subplot(gs[0:8
])plt.grid(True)plt.subplots_adjust(left=0.05, bottom = 0.03, top = 0.95, right 
= 0.98)plt.plot(seq_len_distribution, color = 'black', alpha = 0.3)plt.fill_betw
een(range(0, max_seq_len + 1), seq_len_distribution, y2 = 0, color = 'black', al
pha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence length')if xtic
kstep == None:xticks
>temp_658|source:-*- coding: utf-8 -*-|start:3354|stop:3854
ceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_poi
nter.seek(self.file_pointer.tell() - len(line))breaksequence += line.strip()self
.seq = sequence if self.allow_mixed_case else sequence.upper()self.pos += 1retur
n Truedef get_seq_by_read_id(self, read_id):self.reset()while self.next():if sel
f.id == read_id:return self.seqreturn Falsedef close(self):self.file_pointer.clo
se()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.ids = []self.
file_pointer.seek(0)
>temp_659|source:-*- coding: utf-8 -*-|start:824|stop:1324
 split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' %
 seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence)
, piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1
]] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close
()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []sel
f.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 1000
 == 0 or self.fasta.
>temp_660|source:-*- coding: utf-8 -*-|start:3558|stop:4058
e sequence.upper()self.pos += 1return Truedef get_seq_by_read_id(self, read_id):
self.reset()while self.next():if self.id == read_id:return self.seqreturn Falsed
ef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id  = N
oneself.seq = Noneself.ids = []self.file_pointer.seek(0)def visualize_sequence_l
ength_distribution(self, title, dest = None, max_seq_len = None, xtickstep = Non
e, ytickstep = None):import matplotlib.pyplot as pltimport matplotlib.gridspec a
s gridspecsequence_l
>temp_661|source:-*- coding: utf-8 -*-|start:5970|stop:6470
y:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()excep
t:passreturnclass QualSource:def __init__(self, quals_file_path, lazy_init = Tru
e):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_init = lazy_i
nitself.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = [
]self.file_pointer = open(self.quals_file_path)self.file_pointer.seek(0)if self.
lazy_init:self.total_quals = Noneelse:self.total_quals = len([l for l in self.fi
le_pointer.readlines
>temp_662|source:-*- coding: utf-8 -*-|start:6405|stop:6905
se:self.total_quals = len([l for l in self.file_pointer.readlines() if l.startsw
ith('>')])self.reset()def next(self):self.id = self.file_pointer.readline()[1:].
strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self.
file_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.st
rip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Tr
ueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_poin
ter.tell() - len(lin
>temp_663|source:-*- coding: utf-8 -*-|start:1881|stop:2381
hash_dict = {}self.unique_hash_list = []self.unique_next_hash = 0self.file_point
er = open(self.fasta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.t
otal_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.readlines
() if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash()def i
nit_unique_hash(self):while self.next_regular():hash = hashlib.sha1(self.seq.upp
er()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[hash]['i
ds'].append(self.id)
>temp_664|source:-*- coding: utf-8 -*-|start:405|stop:905
t_file_path):self.output_file_path = output_file_pathself.output_file_obj = open
(output_file_path, 'w')def store(self, entry, split = True, store_frequencies = 
True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry.id, '
frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry
.seq, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id)def w
rite_seq(self, seq, split = True):if split:seq = self.split(seq)self.output_file
_obj.write('%s\n' % 
>temp_665|source:-*- coding: utf-8 -*-|start:3742|stop:4242
_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.seq = Noneself.i
ds = []self.file_pointer.seek(0)def visualize_sequence_length_distribution(self,
 title, dest = None, max_seq_len = None, xtickstep = None, ytickstep = None):imp
ort matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequence_lengt
hs = []self.reset()while self.next():if self.pos % 10000 == 0 or self.pos == 1:s
ys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush()sequen
ce_lengths.append(le
>temp_666|source:-*- coding: utf-8 -*-|start:5243|stop:5743
ion) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickst
ep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ym
ax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(
0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', ha = 'center')ax
1 = plt.subplot(gs[9])plt.rcParams.update({'axes.edgecolor' : 20})plt.grid(False
)plt.yticks([])plt.xticks([])plt.text(0.02, 0.5, 'total: %s / mean: %.2f / std: 
%.2f / min: %s / max
>temp_667|source:-*- coding: utf-8 -*-|start:745|stop:1245
e_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, 
split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % 
seq)def split(self, sequence, piece_length = 80):ticks = range(0, len(sequence),
 piece_length) + [len(sequence)]return '\n'.join([sequence[ticks[x]:ticks[x + 1]
] for x in range(0, len(ticks) - 1)])def close(self):self.output_file_obj.close(
)class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = []self
.fasta = SequenceSou
>temp_668|source:-*- coding: utf-8 -*-|start:3052|stop:3552
eq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['ids']return Trueel
se:return Falseelse:return Falsedef next_regular(self):self.seq = Noneself.id = 
self.file_pointer.readline()[1:].strip()sequence = ''while 1:line = self.file_po
inter.readline()if not line:if len(sequence):self.seq = sequenceself.pos += 1ret
urn Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(self.fil
e_pointer.tell() - len(line))breaksequence += line.strip()self.seq = sequence if
 self.allow_mixed_ca
>temp_669|source:-*- coding: utf-8 -*-|start:3027|stop:3527
d_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_
entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(self):
self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''wh
ile 1:line = self.file_pointer.readline()if not line:if len(sequence):self.seq =
 sequenceself.pos += 1return Trueelse:return Falseif line.startswith('>'):self.f
ile_pointer.seek(self.file_pointer.tell() - len(line))breaksequence += line.stri
p()self.seq = sequen
>temp_670|source:-*- coding: utf-8 -*-|start:1662|stop:2162
se = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init
 = lazy_initself.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Nones
elf.seq = Noneself.ids = []self.unique = uniqueself.unique_hash_dict = {}self.un
ique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fasta_
file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:s
elf.total_seq = len([l for l in self.file_pointer.readlines() if l.startswith('>
')])self.reset()if s
>temp_671|source:-*- coding: utf-8 -*-|start:6474|stop:6974
f l.startswith('>')])self.reset()def next(self):self.id = self.file_pointer.read
line()[1:].strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:l
ine = self.file_pointer.readline()if not line:if len(qualscores):self.quals = qu
alscores.strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos +=
 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.seek(sel
f.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()self.qual
s = qualscores.strip
>temp_672|source:-*- coding: utf-8 -*-|start:6649|stop:7149
_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.strip(
)self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return Trueel
se:return Falseif line.startswith('>'):self.file_pointer.seek(self.file_pointer.
tell() - len(line))breakqualscores += ' ' + line.strip()self.quals = qualscores.
strip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return 
Truedef close(self):self.file_pointer.close()def reset(self):self.pos = 0self.id
  = Noneself.quals =
>temp_673|source:-*- coding: utf-8 -*-|start:423|stop:923
output_file_path = output_file_pathself.output_file_obj = open(output_file_path,
 'w')def store(self, entry, split = True, store_frequencies = True):if entry.uni
que and store_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % le
n(entry.ids)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def wr
ite_id(self, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq
, split = True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' 
% seq)def split(self
>temp_674|source:-*- coding: utf-8 -*-|start:2795|stop:3295
_regular()def next_unique(self):if self.unique:if self.total_unique > 0 and self
.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.unique_hash_lis
t[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_mixed_case e
lse hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = hash_entry['i
ds']return Trueelse:return Falseelse:return Falsedef next_regular(self):self.seq
 = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = ''while 1:li
ne = self.file_point
>temp_675|source:-*- coding: utf-8 -*-|start:2702|stop:3202
dict)self.reset()def next(self):if self.unique:return self.next_unique()else:ret
urn self.next_regular()def next_unique(self):if self.unique:if self.total_unique
 > 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.un
ique_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow
_mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = 
hash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(s
elf):self.seq = None
>temp_676|source:-*- coding: utf-8 -*-|start:5003|stop:5503
p == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == None:ytickstep = ma
x(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_seq_len + 1, xt
ickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(seq_len_distribut
ion) + 1, ytickstep),[y for y in range(0, max(seq_len_distribution) + 1, ytickst
ep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.ylim(ymin = 0, ym
ax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.0))plt.figtext(
0.5, 0.96, '%s' % (t
>temp_677|source:-*- coding: utf-8 -*-|start:5681|stop:6181
0.02, 0.5, 'total: %s / mean: %.2f / std: %.2f / min: %s / max: %s'\% (len(seque
nce_lengths),numpy.mean(sequence_lengths), numpy.std(sequence_lengths),\min(sequ
ence_lengths),\max(sequence_lengths)),\va = 'center', alpha = 0.8, size = 'x-lar
ge')if dest == None:dest = self.fasta_file_pathtry:plt.savefig(dest + '.pdf')exc
ept:plt.savefig(dest + '.png')try:plt.show()except:passreturnclass QualSource:de
f __init__(self, quals_file_path, lazy_init = True):self.quals_file_path = quals
_file_pathself.name 
>temp_678|source:-*- coding: utf-8 -*-|start:309|stop:809
docs/COPYING file.import sysimport numpyimport hashlibclass FastaOutput:def __in
it__(self, output_file_path):self.output_file_path = output_file_pathself.output
_file_obj = open(output_file_path, 'w')def store(self, entry, split = True, stor
e_frequencies = True):if entry.unique and store_frequencies:self.write_id('%s|%s
' % (entry.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)self
.write_seq(entry.seq, split)def write_id(self, id):self.output_file_obj.write('>
%s\n' % id)def write
>temp_679|source:-*- coding: utf-8 -*-|start:4904|stop:5404
r = 'black', alpha = 0.15)plt.ylabel('number of sequences')plt.xlabel('sequence 
length')if xtickstep == None:xtickstep = (max_seq_len / 50) or 1if ytickstep == 
None:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, 
max_seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, ma
x(seq_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distrib
ution) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)pl
t.ylim(ymin = 0, yma
>temp_680|source:-*- coding: utf-8 -*-|start:319|stop:819
NG file.import sysimport numpyimport hashlibclass FastaOutput:def __init__(self,
 output_file_path):self.output_file_path = output_file_pathself.output_file_obj 
= open(output_file_path, 'w')def store(self, entry, split = True, store_frequenc
ies = True):if entry.unique and store_frequencies:self.write_id('%s|%s' % (entry
.id, 'frequency:%d' % len(entry.ids)))else:self.write_id(entry.id)self.write_seq
(entry.seq, split)def write_id(self, id):self.output_file_obj.write('>%s\n' % id
)def write_seq(self,
>temp_681|source:-*- coding: utf-8 -*-|start:432|stop:932
le_path = output_file_pathself.output_file_obj = open(output_file_path, 'w')def 
store(self, entry, split = True, store_frequencies = True):if entry.unique and s
tore_frequencies:self.write_id('%s|%s' % (entry.id, 'frequency:%d' % len(entry.i
ds)))else:self.write_id(entry.id)self.write_seq(entry.seq, split)def write_id(se
lf, id):self.output_file_obj.write('>%s\n' % id)def write_seq(self, seq, split =
 True):if split:seq = self.split(seq)self.output_file_obj.write('%s\n' % seq)def
 split(self, sequenc
>temp_682|source:-*- coding: utf-8 -*-|start:1140|stop:1640
lose()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences = [
]self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos % 
1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA in
to memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta.i
d)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):sel
f.fasta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy_i
nit = True, unique =
>temp_683|source:-*- coding: utf-8 -*-|start:6620|stop:7120
 = ''while 1:line = self.file_pointer.readline()if not line:if len(qualscores):s
elf.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split
()]self.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_po
inter.seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line.st
rip()self.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals
.split()]self.pos += 1return Truedef close(self):self.file_pointer.close()def re
set(self):self.pos =
>temp_684|source:-*- coding: utf-8 -*-|start:1118|stop:1618
self.output_file_obj.close()class ReadFasta:def __init__(self, f_name):self.ids 
= []self.sequences = []self.fasta = SequenceSource(f_name)while self.fasta.next(
):if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fast
alib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.i
ds.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\
n')def close(self):self.fasta.close()class SequenceSource():def __init__(self, f
asta_file_path, lazy
>temp_685|source:-*- coding: utf-8 -*-|start:5732|stop:6232
n: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), numpy.st
d(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va = 'center
', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_pathtry:
plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show()except:
passreturnclass QualSource:def __init__(self, quals_file_path, lazy_init = True)
:self.quals_file_path = quals_file_pathself.name = Noneself.lazy_init = lazy_ini
tself.pos = 0self.id
>temp_686|source:-*- coding: utf-8 -*-|start:6706|stop:7206
als = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split()]sel
f.pos += 1return Trueelse:return Falseif line.startswith('>'):self.file_pointer.
seek(self.file_pointer.tell() - len(line))breakqualscores += ' ' + line.strip()s
elf.quals = qualscores.strip()self.quals_int = [int(q) for q in self.quals.split
()]self.pos += 1return Truedef close(self):self.file_pointer.close()def reset(se
lf):self.pos = 0self.id  = Noneself.quals = Noneself.quals_int = Noneself.ids = 
[]self.file_pointer.
>temp_687|source:-*- coding: utf-8 -*-|start:1262|stop:1762
self.fasta.next():if self.fasta.pos % 1000 == 0 or self.fasta.pos == 1:sys.stder
r.write('\r[fastalib] Reading FASTA into memory: %s' % (self.fasta.pos))sys.stde
rr.flush()self.ids.append(self.fasta.id)self.sequences.append(self.fasta.seq)sys
.stderr.write('\n')def close(self):self.fasta.close()class SequenceSource():def 
__init__(self, fasta_file_path, lazy_init = True, unique = False, allow_mixed_ca
se = False):self.fasta_file_path = fasta_file_pathself.name = Noneself.lazy_init
 = lazy_initself.all
>temp_688|source:-*- coding: utf-8 -*-|start:3082|stop:3582
try['id']self.ids = hash_entry['ids']return Trueelse:return Falseelse:return Fal
sedef next_regular(self):self.seq = Noneself.id = self.file_pointer.readline()[1
:].strip()sequence = ''while 1:line = self.file_pointer.readline()if not line:if
 len(sequence):self.seq = sequenceself.pos += 1return Trueelse:return Falseif li
ne.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))b
reaksequence += line.strip()self.seq = sequence if self.allow_mixed_case else se
quence.upper()self.p
>temp_689|source:-*- coding: utf-8 -*-|start:6669|stop:7169
f not line:if len(qualscores):self.quals = qualscores.strip()self.quals_int = [i
nt(q) for q in self.quals.split()]self.pos += 1return Trueelse:return Falseif li
ne.startswith('>'):self.file_pointer.seek(self.file_pointer.tell() - len(line))b
reakqualscores += ' ' + line.strip()self.quals = qualscores.strip()self.quals_in
t = [int(q) for q in self.quals.split()]self.pos += 1return Truedef close(self):
self.file_pointer.close()def reset(self):self.pos = 0self.id  = Noneself.quals =
 Noneself.quals_int 
>temp_690|source:-*- coding: utf-8 -*-|start:2863|stop:3363
> 0 and self.pos < self.total_unique:hash_entry = self.unique_hash_dict[self.uni
que_hash_list[self.pos]]self.pos += 1self.seq = hash_entry['seq'] if self.allow_
mixed_case else hash_entry['seq'].upper()self.id  = hash_entry['id']self.ids = h
ash_entry['ids']return Trueelse:return Falseelse:return Falsedef next_regular(se
lf):self.seq = Noneself.id = self.file_pointer.readline()[1:].strip()sequence = 
''while 1:line = self.file_pointer.readline()if not line:if len(sequence):self.s
eq = sequenceself.po
>temp_691|source:-*- coding: utf-8 -*-|start:2031|stop:2531
nit:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer
.readlines() if l.startswith('>')])self.reset()if self.unique:self.init_unique_h
ash()def init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(se
lf.seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dic
t[hash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self
.unique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'coun
t': 1}self.unique_ha
>temp_692|source:-*- coding: utf-8 -*-|start:1138|stop:1638
.close()class ReadFasta:def __init__(self, f_name):self.ids = []self.sequences =
 []self.fasta = SequenceSource(f_name)while self.fasta.next():if self.fasta.pos 
% 1000 == 0 or self.fasta.pos == 1:sys.stderr.write('\r[fastalib] Reading FASTA 
into memory: %s' % (self.fasta.pos))sys.stderr.flush()self.ids.append(self.fasta
.id)self.sequences.append(self.fasta.seq)sys.stderr.write('\n')def close(self):s
elf.fasta.close()class SequenceSource():def __init__(self, fasta_file_path, lazy
_init = True, unique
>temp_693|source:-*- coding: utf-8 -*-|start:5068|stop:5568
:ytickstep = max(seq_len_distribution) / 20 or 1plt.xticks(range(xtickstep, max_
seq_len + 1, xtickstep), rotation=90, size='xx-small')plt.yticks(range(0, max(se
q_len_distribution) + 1, ytickstep),[y for y in range(0, max(seq_len_distributio
n) + 1, ytickstep)],size='xx-small')plt.xlim(xmin = 0, xmax = max_seq_len)plt.yl
im(ymin = 0, ymax = max(seq_len_distribution) + (max(seq_len_distribution) / 20.
0))plt.figtext(0.5, 0.96, '%s' % (title), weight = 'black', size = 'xx-large', h
a = 'center')ax1 = p
>temp_694|source:-*- coding: utf-8 -*-|start:3894|stop:4394
on(self, title, dest = None, max_seq_len = None, xtickstep = None, ytickstep = N
one):import matplotlib.pyplot as pltimport matplotlib.gridspec as gridspecsequen
ce_lengths = []self.reset()while self.next():if self.pos % 10000 == 0 or self.po
s == 1:sys.stderr.write('\r[fastalib] Reading: %s' % (self.pos))sys.stderr.flush
()sequence_lengths.append(len(self.seq))self.reset()sys.stderr.write('\n')if not
 max_seq_len:max_seq_len = max(sequence_lengths) + (int(max(sequence_lengths) / 
100.0) or 10)seq_len
>temp_695|source:-*- coding: utf-8 -*-|start:1899|stop:2399
.unique_hash_list = []self.unique_next_hash = 0self.file_pointer = open(self.fas
ta_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_seq = Noneels
e:self.total_seq = len([l for l in self.file_pointer.readlines() if l.startswith
('>')])self.reset()if self.unique:self.init_unique_hash()def init_unique_hash(se
lf):while self.next_regular():hash = hashlib.sha1(self.seq.upper()).hexdigest()i
f hash in self.unique_hash_dict:self.unique_hash_dict[hash]['ids'].append(self.i
d)self.unique_hash_d
>temp_696|source:-*- coding: utf-8 -*-|start:2034|stop:2534
:self.total_seq = Noneelse:self.total_seq = len([l for l in self.file_pointer.re
adlines() if l.startswith('>')])self.reset()if self.unique:self.init_unique_hash
()def init_unique_hash(self):while self.next_regular():hash = hashlib.sha1(self.
seq.upper()).hexdigest()if hash in self.unique_hash_dict:self.unique_hash_dict[h
ash]['ids'].append(self.id)self.unique_hash_dict[hash]['count'] += 1else:self.un
ique_hash_dict[hash] = {'id' : self.id,'ids': [self.id],'seq': self.seq,'count':
 1}self.unique_hash_
>temp_697|source:-*- coding: utf-8 -*-|start:6324|stop:6824
ls_file_path)self.file_pointer.seek(0)if self.lazy_init:self.total_quals = Nonee
lse:self.total_quals = len([l for l in self.file_pointer.readlines() if l.starts
with('>')])self.reset()def next(self):self.id = self.file_pointer.readline()[1:]
.strip()self.quals = Noneself.quals_int = Nonequalscores = ''while 1:line = self
.file_pointer.readline()if not line:if len(qualscores):self.quals = qualscores.s
trip()self.quals_int = [int(q) for q in self.quals.split()]self.pos += 1return T
rueelse:return False
>temp_698|source:-*- coding: utf-8 -*-|start:5724|stop:6224
.2f / min: %s / max: %s'\% (len(sequence_lengths),numpy.mean(sequence_lengths), 
numpy.std(sequence_lengths),\min(sequence_lengths),\max(sequence_lengths)),\va =
 'center', alpha = 0.8, size = 'x-large')if dest == None:dest = self.fasta_file_
pathtry:plt.savefig(dest + '.pdf')except:plt.savefig(dest + '.png')try:plt.show(
)except:passreturnclass QualSource:def __init__(self, quals_file_path, lazy_init
 = True):self.quals_file_path = quals_file_pathself.name = Noneself.lazy_init = 
lazy_initself.pos = 
>temp_699|source:-*- coding: utf-8 -*-|start:1758|stop:2258
.allow_mixed_case = allow_mixed_caseself.pos = 0self.id  = Noneself.seq = Nonese
lf.ids = []self.unique = uniqueself.unique_hash_dict = {}self.unique_hash_list =
 []self.unique_next_hash = 0self.file_pointer = open(self.fasta_file_path)self.f
ile_pointer.seek(0)if self.lazy_init:self.total_seq = Noneelse:self.total_seq = 
len([l for l in self.file_pointer.readlines() if l.startswith('>')])self.reset()
if self.unique:self.init_unique_hash()def init_unique_hash(self):while self.next
_regular():hash = ha
