Skip to content

Commit 651fb94

Browse files
Merge pull request #12 from colossal-compsci/scratch
Use chicken in vignettes/README fixes #11
2 parents c28c185 + 4ba186d commit 651fb94

14 files changed

+161
-94
lines changed

DESCRIPTION

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: tfboot
22
Title: Bootstrapping and statistical analysis for TFBS-disrupting SNPs
3-
Version: 0.1.0
3+
Version: 0.2.0
44
Authors@R:
55
c(person(given = "Stephen",
66
family = "Turner",
@@ -30,14 +30,14 @@ Imports:
3030
tibble,
3131
tidyr
3232
Suggests:
33-
BSgenome.Cfamiliaris.UCSC.canFam3,
33+
BSgenome.Ggallus.UCSC.galGal6,
34+
TxDb.Ggallus.UCSC.galGal6.refGene,
35+
org.Gg.eg.db,
36+
AnnotationDbi,
3437
knitr,
35-
rmarkdown,
36-
TxDb.Cfamiliaris.UCSC.canFam3.ncbiRefSeq
37-
Remotes:
38-
stephenturner/TxDb.Cfamiliaris.UCSC.canFam3.ncbiRefSeq
38+
rmarkdown
3939
VignetteBuilder: knitr
4040
Depends:
41-
R (>= 2.10)
41+
R (>= 4.2.0)
4242
LazyData: true
4343
Language: en-US

R/tfboot.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ get_upstream_snps <- function(snps, txdb, level="genes", ...) {
100100
stop("level must be either 'genes' or 'transcripts'")
101101
}
102102
# Get upstream intervals of those features
103-
upstream <- get_upstream(features, ...)
103+
upstream <- suppressWarnings(get_upstream(features, ...))
104104
# Intersect with SNPs to get SNPs in those upstream regions
105105
upstreamsnps <- plyranges::join_overlap_intersect(snps, upstream)
106106
# Set an attribute to check later if needed

README.md

Lines changed: 49 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -58,36 +58,36 @@ library(tfboot)
5858
``` r
5959
mbres <- vignettedata$mbres
6060
mbres
61-
#> # A tibble: 449 × 10
62-
#> gene_id SNP_id tf pctRef pctAlt scoreRef scoreAlt effect alleleDiff alleleEffectSize
63-
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
64-
#> 1 ARHGAP30 38:21390808:T:C ACE2 0.754 0.957 3.73 4.71 strong 0.980 0.199
65-
#> 2 ARHGAP30 38:21394731:T:C ASCL1 0.882 0.774 4.68 4.13 weak -0.558 -0.105
66-
#> 3 ARHGAP30 38:21394731:T:C ASCL1 0.857 0.726 5.75 4.91 strong -0.846 -0.127
67-
#> 4 ARHGAP30 38:21390808:T:C AT5G04390 0.861 0.692 4.29 3.48 strong -0.808 -0.163
68-
#> 5 ARHGAP30 38:21394731:T:C Atoh1 0.997 0.848 6.19 5.29 strong -0.901 -0.145
69-
#> 6 ARHGAP30 38:21390808:T:C BZIP52 0.765 0.860 4.98 5.57 weak 0.591 0.0918
70-
#> 7 ARHGAP30 38:21394731:T:C Bhlha15 0.746 0.853 4.98 5.68 weak 0.696 0.105
71-
#> 8 ARHGAP30 38:21394731:T:C ERF096 0.741 0.927 3.47 4.28 strong 0.811 0.176
72-
#> 9 ARHGAP30 38:21394731:T:C ERF13 0.723 0.903 3.99 4.95 strong 0.966 0.177
73-
#> 10 ARHGAP30 38:21394731:T:C ERF7 0.738 0.942 2.72 3.40 weak 0.678 0.189
74-
#> # ℹ 439 more rows
61+
#> # A tibble: 1,335 × 10
62+
#> gene_id SNP_id tf pctRef pctAlt scoreRef scoreAlt effect alleleDiff alleleEffectSize
63+
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
64+
#> 1 100316005 chr33:7472152_G/C ACE2 0.986 0.783 4.85 3.87 strong -0.980 -0.199
65+
#> 2 100316005 chr33:7472176_T/G ADR1 0.759 0.917 3.34 4.01 weak 0.673 0.154
66+
#> 3 100316005 chr33:7471139_A/C AFT2 0.964 0.878 5.50 5.02 weak -0.480 -0.0843
67+
#> 4 100316005 chr33:7472176_T/G AFT2 0.902 0.740 5.15 4.24 strong -0.911 -0.160
68+
#> 5 100316005 chr33:7473834_G/T AFT2 0.879 0.705 5.02 4.04 strong -0.980 -0.172
69+
#> 6 100316005 chr33:7472152_G/C AGL42 0.859 0.693 5.14 4.15 strong -0.986 -0.165
70+
#> 7 100316005 chr33:7471139_A/C AGL55 0.695 0.861 3.97 4.91 strong 0.939 0.165
71+
#> 8 100316005 chr33:7472830_A/T ALX3 0.972 0.776 4.23 3.43 strong -0.806 -0.185
72+
#> 9 100316005 chr33:7473857_T/C ALX3 0.919 0.773 4.02 3.42 weak -0.600 -0.138
73+
#> 10 100316005 chr33:7473583_G/A ARG80 0.731 0.961 3.16 4.15 strong 0.993 0.230
74+
#> # ℹ 1,325 more rows
7575
mball <- vignettedata$mball
7676
mball
77-
#> # A tibble: 34,053 × 10
78-
#> gene_id SNP_id tf pctRef pctAlt scoreRef scoreAlt effect alleleDiff alleleEffectSize
79-
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
80-
#> 1 ACKR1 38:22769951:C:T ABF3 0.920 0.733 4.88 3.94 strong -0.947 -0.179
81-
#> 2 ACKR1 38:22769951:C:T ARG80 0.910 0.727 3.93 3.14 strong -0.788 -0.182
82-
#> 3 ACKR1 38:22769951:C:T CAMTA2 0.916 0.703 4.09 3.17 strong -0.921 -0.207
83-
#> 4 ACKR1 38:22769951:C:T CAMTA3 0.940 0.741 4.27 3.38 strong -0.893 -0.197
84-
#> 5 ACKR1 38:22769951:C:T HES1 0.863 0.682 4.90 3.90 strong -0.997 -0.176
85-
#> 6 ACKR1 38:22769951:C:T NHP10 0.881 0.716 5.41 4.43 strong -0.981 -0.160
86-
#> 7 ACKR1 38:22769951:C:T OJ1058_F05.8 0.851 0.701 5.57 4.59 strong -0.974 -0.149
87-
#> 8 ACKR1 38:22769951:C:T SUT1 0.888 0.639 3.61 2.62 strong -0.993 -0.245
88-
#> 9 ACKR1 38:22769951:C:T ZNF816 0.840 0.896 7.97 8.47 weak 0.498 0.0530
89-
#> 10 ACKR1 38:22769951:C:T nuc-1 0.856 0.697 5.42 4.42 strong -0.998 -0.158
90-
#> # ℹ 34,043 more rows
77+
#> # A tibble: 12,689 × 10
78+
#> gene_id SNP_id tf pctRef pctAlt scoreRef scoreAlt effect alleleDiff alleleEffectSize
79+
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
80+
#> 1 100315917 chr33:5706617_C/T ARG80 0.979 0.749 4.23 3.24 strong -0.993 -0.230
81+
#> 2 100315917 chr33:5706617_C/T ARG81 0.991 0.825 5.89 4.90 strong -0.982 -0.165
82+
#> 3 100315917 chr33:5703167_G/T ARGFX 0.781 0.889 5.06 5.74 weak 0.677 0.105
83+
#> 4 100315917 chr33:5706617_C/T ARR1 0.634 0.880 2.53 3.42 strong 0.892 0.231
84+
#> 5 100315917 chr33:5707492_C/T AT1G19040 0.882 0.779 7.60 6.73 strong -0.861 -0.100
85+
#> 6 100315917 chr33:5704406_C/G AT3G46070 0.784 0.892 5.13 5.79 weak 0.666 0.103
86+
#> 7 100315917 chr33:5706617_C/T ATHB-12 0.781 0.890 4.12 4.67 weak 0.552 0.106
87+
#> 8 100315917 chr33:5706617_C/T ATHB-13 0.721 0.868 4.98 5.98 strong 0.993 0.145
88+
#> 9 100315917 chr33:5706617_C/T ATHB-16 0.813 0.966 5.32 6.32 strong 0.998 0.153
89+
#> 10 100315917 chr33:5706617_C/T ATHB-4 0.778 0.931 4.98 5.94 strong 0.963 0.151
90+
#> # ℹ 12,679 more rows
9191
```
9292

9393
Summarize motifbreakR results on our 5 genes of interest. This shows us
@@ -101,7 +101,7 @@ mbsmry
101101
#> # A tibble: 1 × 7
102102
#> ngenes nsnps nstrong alleleDiffAbsMean alleleDiffAbsSum alleleEffectSizeAbsMean alleleEffectSizeAbsSum
103103
#> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
104-
#> 1 5 449 363 0.837 376. 0.157 70.7
104+
#> 1 5 1335 950 0.798 1066. 0.155 206.
105105
```
106106

107107
Bootstrap resample motifbreakR results for all genes. Resample sets of 5
@@ -112,18 +112,18 @@ set.seed(42)
112112
mbboot <- mb_bootstrap(mball, ngenes=5, boots = 250)
113113
mbboot$bootwide
114114
#> # A tibble: 250 × 9
115-
#> boot genes ngenes nsnps nstrong alleleDiffAbsMean alleleDiffAbsSum alleleEffectSizeAbsMean alleleEffectSizeAbsSum
116-
#> <int> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
117-
#> 1 1 HSD17B7;SLAMF9;LOC102151674;LOC111094585;LOC102155451 5 1463 1100 0.808 1182. 0.162 236.
118-
#> 2 2 SLAMF7;LOC111094577;LOC111094537;HSD17B7;LOC111094548 5 631 419 0.774 489. 0.159 100.
119-
#> 3 3 GPATCH2;CDK18;LOC102153439;LOC111094501;LOC111094466 5 342 258 0.810 277. 0.157 53.6
120-
#> 4 4 LOC111094614;LOC111094516;CD1E;LOC111094588;LOC111094527 5 789 539 0.785 620. 0.160 126.
121-
#> 5 5 LOC111094522;LOC111094553;FAM177B;LOC111094466;CNTN2 5 608 435 0.800 486. 0.151 91.8
122-
#> 6 6 LOC111094613;SLAMF9;RXRG;LOC111094514;APOA2 5 568 381 0.779 443. 0.147 83.4
123-
#> 7 7 PIGM;LOC111094605;LOC111094471;LOC111094509;AIDA 5 991 706 0.799 792. 0.159 158.
124-
#> 8 8 LEMD1;SH2D1B;FCER1A;CDK18;LOC111094593 5 472 336 0.790 373. 0.159 74.9
125-
#> 9 9 FCER1G;LOC111094573;LOC111094582;LEMD1;LOC111094560 5 685 490 0.802 549. 0.152 104.
126-
#> 10 10 EIF2D;LOC102152982;PPP1R15B;LOC111094577;LOC111094514 5 808 486 0.756 611. 0.148 120.
115+
#> boot genes ngenes nsnps nstrong alleleDiffAbsMean alleleDiffAbsSum alleleEffectSizeAbsMean alleleEffectSizeAbsSum
116+
#> <int> <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
117+
#> 1 1 431304;426469;100315917;396544;374124 5 1062 750 0.797 847. 0.155 164.
118+
#> 2 2 426183;395959;431304;429501;396170 5 1244 914 0.808 1005. 0.157 195.
119+
#> 3 3 102465361;426183;396544;426469;429172 5 886 655 0.812 720. 0.149 132.
120+
#> 4 4 396007;407779;693245;429501;100498692 5 1514 1139 0.815 1234. 0.153 232.
121+
#> 5 5 426883;396544;408041;426183;426469 5 925 623 0.787 728. 0.147 136.
122+
#> 6 6 425059;429035;100529062;396007;425613 5 1350 1026 0.821 1109. 0.155 209.
123+
#> 7 7 408042;426880;100498692;425499;426885 5 823 627 0.823 677. 0.157 129.
124+
#> 8 8 396170;425058;426886;395587;396045 5 1262 910 0.805 1016. 0.147 185.
125+
#> 9 9 102466833;426183;100529061;396045;395959 5 1263 963 0.822 1038. 0.153 193.
126+
#> 10 10 429035;408042;100529062;100529061;425613 5 1289 987 0.823 1061. 0.154 198.
127127
#> # ℹ 240 more rows
128128
mbboot$bootdist
129129
#> # A tibble: 6 × 2
@@ -144,14 +144,14 @@ distribution from bootstrap resampling.
144144
bootstats <- mb_bootstats(mbsmry, mbboot)
145145
bootstats
146146
#> # A tibble: 6 × 5
147-
#> metric stat bootdist bootmax p
148-
#> <chr> <dbl> <list> <dbl> <dbl>
149-
#> 1 nsnps 449 <dbl [250]> 1646 0.82
150-
#> 2 nstrong 363 <dbl [250]> 1253 0.732
151-
#> 3 alleleDiffAbsMean 0.837 <dbl [250]> 0.845 0.02
152-
#> 4 alleleDiffAbsSum 376. <dbl [250]> 1347. 0.796
153-
#> 5 alleleEffectSizeAbsMean 0.157 <dbl [250]> 0.173 0.556
154-
#> 6 alleleEffectSizeAbsSum 70.7 <dbl [250]> 264. 0.816
147+
#> metric stat bootdist bootmax p
148+
#> <chr> <dbl> <list> <dbl> <dbl>
149+
#> 1 nsnps 1335 <dbl [250]> 1597 0.136
150+
#> 2 nstrong 950 <dbl [250]> 1210 0.188
151+
#> 3 alleleDiffAbsMean 0.798 <dbl [250]> 0.831 0.836
152+
#> 4 alleleDiffAbsSum 1066. <dbl [250]> 1304. 0.148
153+
#> 5 alleleEffectSizeAbsMean 0.155 <dbl [250]> 0.163 0.456
154+
#> 6 alleleEffectSizeAbsSum 206. <dbl [250]> 251. 0.128
155155
```
156156

157157
Visualize the results:

data/vignettedata.rda

-720 KB
Binary file not shown.

inst/extdata/README.sh

Lines changed: 0 additions & 16 deletions
This file was deleted.

inst/extdata/galGal6-chr33.vcf.gz

92.3 KB
Binary file not shown.

inst/extdata/galGal6-chr33.vcf.gz.tbi

3.78 KB
Binary file not shown.

inst/extdata/rosie38.vcf.gz

-212 KB
Binary file not shown.

inst/extdata/rosie38.vcf.gz.tbi

-9.98 KB
Binary file not shown.

inst/extdata/simulate-vcf.R

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
library(BSgenome.Ggallus.UCSC.galGal6)
2+
library(TxDb.Ggallus.UCSC.galGal6.refGene)
3+
library(VariantAnnotation)
4+
5+
set.seed(2023-06-19)
6+
7+
makesnp <- function(x) {
8+
if (x=="N") return("N")
9+
nt <- c("A", "C", "G", "T")
10+
stopifnot(x %in% nt)
11+
nt <- setdiff(nt, x)
12+
sample(nt, 1)
13+
}
14+
15+
# Set the BSGenome
16+
bsgenome <- BSgenome.Ggallus.UCSC.galGal6
17+
18+
# Get the length of each chromosome from the BSGenome object
19+
chromosome_lengths <- seqlengths(bsgenome)
20+
21+
# Define how many SNPs you want to generate per chromosome
22+
num_snps <- 20000 # Change as needed
23+
24+
# Initialize an empty GRanges object to store your simulated SNPs
25+
simulated_snps <- GRanges()
26+
27+
# Loop through each chromosome
28+
for (chr_name in names(chromosome_lengths)) {
29+
30+
# skip everything except chr33
31+
if (chr_name!="chr33") next
32+
33+
# Generate random positions for the SNPs
34+
snp_positions <- sort(sample(10000:(chromosome_lengths[chr_name]-10000), num_snps))
35+
36+
# Generate GRanges object for this chromosome
37+
chr_snps <- GRanges(seqnames = chr_name,
38+
ranges = IRanges(start = snp_positions, width = 1),
39+
strand = "*")
40+
41+
# Add the simulated SNPs for this chromosome to the total
42+
simulated_snps <- c(simulated_snps, chr_snps)
43+
}
44+
45+
# Make the VCF
46+
vcf <- VCF(rowRanges = simulated_snps,
47+
colData=DataFrame(Samples=1L, row.names = "sample1"),
48+
exptData=list(header=VCFHeader()))
49+
50+
# Set ref alleles (DNAStringSet)
51+
ref(vcf) <- getSeq(bsgenome, simulated_snps)
52+
53+
# Set ref alleles (DNAStringSetList)
54+
alt(vcf) <-
55+
ref(vcf) |>
56+
as.character() |>
57+
sapply(makesnp) |>
58+
unname() |>
59+
VariantAnnotation:::.toDNAStringSetList()
60+
61+
# Set the genotypes
62+
assays(vcf, withDimnames=FALSE) <- list(GT=matrix(rep("1/1", length(vcf)), ncol=1, dimnames = list(NULL, "sample1")))
63+
64+
65+
# Write the VCF
66+
vcffile <- here::here("inst/extdata/galGal6-chr33.vcf")
67+
writeVcf(vcf, vcffile)
68+
system(paste0("head -n10 ", vcffile))
69+
70+
# Make it spec compliant
71+
system(paste0("gsed -i '1 i##fileformat=VCFv4.3' ", vcffile))
72+
system(paste0("bgzip -f ", vcffile))
73+
system(paste0("tabix -f ", vcffile, ".gz"))
74+
system(paste0("bcftools view ", vcffile, ".gz | head -n10"))
75+

inst/extdata/todd38.vcf.gz

-211 KB
Binary file not shown.

inst/extdata/todd38.vcf.gz.tbi

-10.2 KB
Binary file not shown.
3.44 KB
Loading

0 commit comments

Comments
 (0)