Commit 9bb201f

Merge pull request #118 from J35P312/master
TIDDIT 3.8.0

2 parents: 17172e6 + 7b40f59
7 files changed: +74 -62 lines

README.md  (+4)

@@ -38,6 +38,8 @@ tiddit --sv --help
 tiddit --cov --help
 ```
 
+Optionally, the assembly calling may be turned off using the "--skip_assembly" option.
+
 TIDDIT may be installed using bioconda:
 ```
 conda install tiddit
@@ -60,6 +62,7 @@ Where bam is the input bam or cram file. And reference.fasta is the reference fa
 
 TIDDIT may be fine-tuned by altering these optional parameters:
 
+
 -o output prefix(default=output)
 -i paired reads maximum allowed insert size. Pairs aligning on the same chr at a distance higher than this are considered candidates for SV (default= 99.9th percentile of insert size)
 -d expected reads orientations, possible values "innie" (-> <-) or "outtie" (<- ->). Default: major orientation within the dataset
@@ -74,6 +77,7 @@ TIDDIT may be fine-tuned by altering these optional parameters:
 -s Number of reads to sample when computing library statistics(default=25000000)
 -z minimum variant size (default=50), variants smaller than this will not be printed ( z < 10 is not recomended)
 --force_ploidy force the ploidy to be set to -n across the entire genome (i.e skip coverage normalisation of chromosomes)
+--force_overwrite force the analysis and overwrite any data in the output folder
 --n_mask exclude regions from coverage calculation if they contain more than this fraction of N (default = 0.5)
 --skip_assembly Skip running local assembly, tiddit will perform worse, but wont require fermi2, bwa, ropebwt and bwa indexed ref
 --bwa path to bwa executable file(default=bwa)

setup.py  (+2 -1)

@@ -17,14 +17,15 @@
         "tiddit/tiddit_coverage.pyx",
         "tiddit/tiddit_cluster.pyx",
         "tiddit/tiddit_coverage_analysis.pyx",
+        "tiddit/tiddit_gc.pyx",
         "tiddit/tiddit_variant.pyx",
         "tiddit/tiddit_contig_analysis.pyx"])
 else:
     ext_modules = []
 
 setup(
     name = 'tiddit',
-    version = '3.7.0',
+    version = '3.8.0',
 
 
     url = "https://github.com/SciLifeLab/TIDDIT",

tiddit/__main__.py  (+10 -3)

@@ -15,9 +15,10 @@
 import tiddit.tiddit_cluster as tiddit_cluster
 import tiddit.tiddit_variant as tiddit_variant
 import tiddit.tiddit_contig_analysis as tiddit_contig_analysis
+import tiddit.tiddit_gc as tiddit_gc
 
 def main():
-    version="3.7.0"
+    version="3.8.0"
     parser = argparse.ArgumentParser("""tiddit-{}""".format(version),add_help=False)
     parser.add_argument("--sv" , help="call structural variation", required=False, action="store_true")
     parser.add_argument("--cov" , help="generate a coverage bed file", required=False, action="store_true")
@@ -95,6 +96,10 @@ def main():
         samfile = pysam.AlignmentFile(bam_file_name, "r",reference_filename=args.ref)
 
     bam_header=samfile.header
+    chromosomes=[]
+
+    for chr in bam_header["SQ"]:
+        chromosomes.append(chr["SN"])
     samfile.close()
 
 
@@ -149,8 +154,10 @@ def main():
     print("extracted signals in:")
     print(t-time.time())
 
+    gc_dictionary=tiddit_gc.main(args.ref,chromosomes,args.threads,50,0.5)
+
     t=time.time()
-    library=tiddit_coverage_analysis.determine_ploidy(coverage_data,contigs,library,args.n,prefix,args.c,args.ref,50,bam_header)
+    library=tiddit_coverage_analysis.determine_ploidy(coverage_data,contigs,library,args.n,prefix,args.c,args.ref,50,bam_header,gc_dictionary)
     print("calculated coverage in:")
     print(time.time()-t)
 
@@ -179,7 +186,7 @@ def main():
     f.write(vcf_header+"\n")
 
     t=time.time()
-    variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len)
+    variants=tiddit_variant.main(bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,gc_dictionary)
     print("analyzed clusters in")
     print(time.time()-t)
 
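
For context, the chromosome list that now feeds the GC computation is taken directly from the @SQ records of the BAM/CRAM header. A minimal standalone sketch of the same pysam pattern, with "sample.bam" as a placeholder input:

```
import pysam

# Open the alignment file; the path is a placeholder, and CRAM input would
# also need reference_filename=... as in the TIDDIT call above.
samfile = pysam.AlignmentFile("sample.bam", "r")

# Every @SQ record in the header names one reference sequence under "SN",
# which is exactly the loop added to __main__.py.
chromosomes = [sq["SN"] for sq in samfile.header["SQ"]]
samfile.close()

print(chromosomes)
```

The resulting names are what __main__.py hands to tiddit_gc.main together with the hard-coded 50 bp bin size and 0.5 N-fraction cutoff.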

tiddit/tiddit_coverage_analysis.pyx  (+1 -50)

@@ -6,56 +6,7 @@ import gzip
 
 import tiddit.tiddit_coverage as tiddit_coverage
 
-
-def get_gc(str reference_fasta,bam_header,bin_size):
-
-    gc=tiddit_coverage.create_coverage(bam_header,bin_size)[0]
-
-    reference={}
-    chromosomes=[]
-    if not reference_fasta.endswith(".gz"):
-        with open(reference_fasta, 'r') as f:
-            sequence=f.read()
-    else:
-        with gzip.open(reference_fasta, 'r') as f:
-            sequence=f.read()
-
-
-    #split_reference=sequence.split(">")
-    split_reference=re.split("\n>|^>", sequence)
-    del sequence
-    del split_reference[0]
-
-    for chromosome in split_reference:
-        content=chromosome.split("\n",1)
-        reference[content[0].strip().split()[0]]=content[1].replace("\n","")
-
-    N=set(["N","n"])
-    GC=set(["G","g","C","c"])
-
-    for chromosome in gc:
-        for i in range(0,len(gc)):
-            start=i*bin_size
-            end=(i+1)*bin_size
-            if end > len(reference[chromosome]):
-                end=len(reference[chromosome])
-
-            N_count=0
-            GC_count=0
-            for j in range(start,end):
-                if reference[chromosome][j] in N:
-                    N_count +=1
-
-            #masked bin
-            if N_count > bin_size/2:
-                gc[chromosome][i] = -1
-
-    return(gc)
-
-
-def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str prefix,c, str reference_fasta,int bin_size,bam_header):
-
-    gc=get_gc(reference_fasta,bam_header,bin_size)
+def determine_ploidy(dict coverage_data,contigs,dict library,int ploidy,str prefix,c, str reference_fasta,int bin_size,bam_header,gc):
 
     f=open( "{}.ploidies.tab".format(prefix),"w" )
     f.write("Chromosome\tPloidy\tPloidy_rounded\tMean_coverage\n")

tiddit/tiddit_gc.pyx  (new file, +45)

@@ -0,0 +1,45 @@
+import pysam
+import numpy
+import math
+from joblib import Parallel, delayed
+
+def binned_gc(fasta_path,contig,bin_size,n_cutoff):
+    fasta=pysam.FastaFile(fasta_path)
+    contig_length=fasta.get_reference_length(contig)
+    elements=int(math.ceil(contig_length/bin_size))
+
+    contig_gc=numpy.zeros(elements,dtype=numpy.int8)
+
+    start=0
+    for i in range(0,elements):
+        slice=fasta.fetch(contig, start, start+bin_size)
+        n=0
+        gc=0
+
+        for charachter in slice:
+            if charachter == "N" or charachter == "n":
+                n+=1
+            elif charachter == "C" or charachter == "c" or charachter == "G" or charachter == "g":
+                gc+=1
+
+        if n/bin_size > n_cutoff:
+            contig_gc[i]=-1
+
+        else:
+            contig_gc[i]=round(100*gc/elements)
+
+        start+=bin_size
+
+    return([contig,contig_gc])
+
+def main(reference,contigs,threads,bin_size,n_cutoff):
+    gc_list=Parallel(n_jobs=threads)( delayed(binned_gc)(reference,contig,bin_size,n_cutoff) for contig in contigs)
+
+    gc_dictionary={}
+    for gc in gc_list:
+        gc_dictionary[gc[0]]=gc[1]
+
+    return(gc_dictionary)
+
+
+#contigs=["1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17"]
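
This new module is what the __main__.py hunk above calls once per run. A sketch of using it on its own, assuming a faidx-indexed reference and placeholder contig names:

```
import tiddit.tiddit_gc as tiddit_gc

# Placeholders: a faidx-indexed reference.fasta and three contig names.
# Bin size 50 and n_cutoff 0.5 mirror the values used in tiddit/__main__.py.
gc_dictionary = tiddit_gc.main("reference.fasta", ["1", "2", "X"], 4, 50, 0.5)

# One numpy int8 array per contig, one entry per 50 bp bin:
# -1 flags bins that are mostly N, other entries hold the binned GC value.
print(gc_dictionary["1"][:10])
```

Each contig is handled by its own joblib worker, and the per-contig arrays are merged into a single dictionary keyed by contig name.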

tiddit/tiddit_signal.pyx  (+1 -1)

@@ -232,7 +232,7 @@ def main(str bam_file_name,str ref,str prefix,int min_q,int max_ins,str sample_i
     bam_header=samfile.header
     samfile.close()
     cdef int bin_size=50
-    cdef str file_type=u"wig"
+    cdef str file_type="wig"
     cdef str outfile=prefix+".tiddit_coverage.wig"
 
     cdef long t_tot=0

tiddit/tiddit_variant.pyx  (+11 -7)

@@ -177,11 +177,11 @@ def find_sv_type(chrA,chrB,inverted,non_inverted,args,sample_data,samples,librar
             return("DUP:INV",cn)
         else:
             return("DUP:TANDEM",cn)
-    elif cn < p:
-        return("DEL",cn)
 
-    elif inverted > non_inverted:
+    if inverted > non_inverted:
         return("INV",cn)
+    elif cn < p:
+        return("DEL",cn)
     else:
         return("BND",cn)
 
@@ -231,7 +231,7 @@ def sv_filter(sample_data,args,chrA,chrB,posA,posB,max_ins_len,n_discordants,n_s
 
     return(filt)
 
-def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs):
+def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,contig_seqs,gc):
     cdef AlignmentFile samfile = AlignmentFile(bam_file_name, "r",reference_filename=args.ref,index_filename="{}_tiddit/{}.csi".format(args.o,samples[0]))
     variants=[]
 
@@ -270,6 +270,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
 
         s=int(math.floor(sv_clusters[chrA][chrB][cluster]["startB"]/50.0))
         e=int(math.floor(sv_clusters[chrA][chrB][cluster]["endB"]/50.0))+1
+
         avg_b=numpy.average(coverage_data[chrB][s:e])
 
         if avg_b == 0:
@@ -302,7 +303,10 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
             else:
                 s=int(math.floor(posA/50.0))
                 e=int(math.floor(posB/50.0))+1
-                sample_data[sample]["covM"]=numpy.average(coverage_data[chrA][s:e] )
+                coverage_between=coverage_data[chrA][s:e]
+                gc_between=gc[chrA][s:e]
+
+                sample_data[sample]["covM"]=numpy.average(coverage_between[ gc_between > -1 ] )
 
         inverted=0
         non_inverted=0
@@ -528,7 +532,7 @@ def define_variant(str chrA, str bam_file_name,dict sv_clusters,args,dict librar
     samfile.close()
     return(variants)
 
-def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len):
+def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,samples,dict coverage_data,contig_number,max_ins_len,gc):
     contig_seqs={}
     new_seq=False
     if not args.skip_assembly:
@@ -547,7 +551,7 @@ def main(str bam_file_name,dict sv_clusters,args,dict library,int min_mapq,sampl
         for chrB in sv_clusters[chrA]:
             variants[chrB]=[]
 
-    variants_list=Parallel(n_jobs=args.threads,prefer="threads",timeout=99999)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs) for chrA in sv_clusters)
+    variants_list=Parallel(n_jobs=args.threads,prefer="threads",timeout=99999)( delayed(define_variant)(chrA,bam_file_name,sv_clusters,args,library,min_mapq,samples,coverage_data,contig_number,max_ins_len,contig_seqs,gc) for chrA in sv_clusters)
 
     ratios={"fragments_A":[],"fragments_B":[],"reads_A":[],"reads_B":[]}
     for v in variants_list:
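
Two behavioural changes are visible here: find_sv_type now checks for an excess of inverted read pairs before falling back to the copy-number based DEL call, and define_variant averages the coverage between breakpoints only over bins whose GC entry is not the -1 N-mask flag. A small numpy illustration of that masking, with made-up values:

```
import numpy

# Made-up per-bin coverage across a candidate SV and the matching GC track;
# -1 in the GC track marks bins that were mostly N in the reference.
coverage_between = numpy.array([30.0, 32.0, 0.0, 0.0, 29.0])
gc_between = numpy.array([44, 41, -1, -1, 47], dtype=numpy.int8)

# Boolean masking keeps only the non-masked bins, so reference N-gaps
# no longer drag the mean coverage (covM) towards zero.
covM = numpy.average(coverage_between[gc_between > -1])

print(covM)                             # 30.33...
print(numpy.average(coverage_between))  # 18.2, the unmasked average used before 3.8.0
```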
