Skip to content

Funcotator - Adding cDNA Strings for Intronic Variants #5321

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1649,6 +1649,90 @@ public static String convertHG19ContigToB37Contig( final String hg19Contig ) {
return HG19_TO_B37_CONTIG_NAME_MAP.getOrDefault(hg19Contig, hg19Contig);
}

/**
 * Build the cDNA change string for a variant located in an intron.
 *
 * The string identifies the exon nearest to the variant start, the signed distance (in bases) from that exon's
 * nearest boundary, and the alleles involved:
 *     c.e[EXON NUMBER][+|-][BASES FROM EXON][REF ALLELE]>[ALT ALLELE]
 * For example:
 *     c.e2-1A>G
 * means:
 *     2  = the 1-based position in {@code exonList} of the exon closest to the variant start
 *     -1 = the variant start is 1 base before the nearest boundary of that exon
 *     A  = reference allele
 *     G  = alternate allele
 *
 * The offset is {@code variantStart} minus whichever of the exon's start or end is nearer (the start wins a tie).
 * A negative offset is rendered with {@code -}; zero and positive offsets are rendered with {@code +}.
 *
 * @param variantStart The start position (1-based, inclusive) in genomic coordinates of the variant.
 * @param exonList The {@link List} of {@link Locatable} exons in the transcript in which this variant occurs. Must not be {@code null}.
 * @param strandCorrectedRefAllele The bases of the reference allele, already corrected for strandedness (i.e. reverse-complemented by the caller if the transcript is on the {@link Strand#NEGATIVE} strand). Must not be {@code null}.
 * @param strandCorrectedAltAllele The bases of the alternate allele, already corrected for strandedness (i.e. reverse-complemented by the caller if the transcript is on the {@link Strand#NEGATIVE} strand). Must not be {@code null}.
 * @return The cDNA change string for the given data, or the literal string {@code "NA"} if no closest exon could be found (e.g. {@code exonList} is empty).
 */
public static String createIntronicCDnaString(final int variantStart,
                                              final List<? extends Locatable> exonList,
                                              final String strandCorrectedRefAllele,
                                              final String strandCorrectedAltAllele) {

    Utils.nonNull(exonList);
    Utils.nonNull(strandCorrectedRefAllele);
    Utils.nonNull(strandCorrectedAltAllele);

    // Locate the exon nearest to the variant; -1 signals that there was nothing to pick from.
    final int nearestExonIndex = getClosestExonIndex(variantStart, exonList);
    if ( nearestExonIndex == -1 ) {
        return "NA";
    }

    final Locatable nearestExon = exonList.get(nearestExonIndex);

    // Signed distances from each exon boundary (negative => variant lies before that boundary):
    final int offsetFromExonStart = variantStart - nearestExon.getStart();
    final int offsetFromExonEnd   = variantStart - nearestExon.getEnd();

    // Keep the offset of the nearer boundary; the exon start is preferred when both are equally far.
    final int signedOffset =
            Math.abs(offsetFromExonEnd) < Math.abs(offsetFromExonStart) ? offsetFromExonEnd : offsetFromExonStart;

    final String direction = signedOffset < 0 ? "-" : "+";

    // Assemble the cDNA string itself (exon numbers are reported 1-based):
    return new StringBuilder("c.e")
            .append(nearestExonIndex + 1)
            .append(direction)
            .append(Math.abs(signedOffset))
            .append(strandCorrectedRefAllele)
            .append(">")
            .append(strandCorrectedAltAllele)
            .toString();
}


/**
 * Find the index of the exon whose start or end coordinate is nearest to the given variant start position.
 * Both boundaries of every exon are considered, so the chosen exon may lie before the variant start, contain it,
 * or lie after it.
 * When two exons are equally near, the one that appears first in {@code exonList} is returned.
 * @param variantStartPos Start position (1-based, inclusive) in genomic coordinates of a variant.
 * @param exonList The {@link List} of {@link Locatable} exons in the transcript in which this variant occurs. Must not be {@code null}.
 * @return The index into the given {@code exonList} of the entry which is the fewest bases away from the given variant position. If {@code exonList} is empty, will return {@code -1}.
 */
public static int getClosestExonIndex( final int variantStartPos,
                                       final List<? extends Locatable> exonList) {
    Utils.nonNull(exonList);

    int closestIndex = -1;
    int smallestDistance = Integer.MAX_VALUE;

    int candidateIndex = 0;
    for ( final Locatable exon : exonList ) {
        // Distance to an exon is the distance to whichever of its two boundaries is nearer:
        final int distToStart = Math.abs(variantStartPos - exon.getStart());
        final int distToEnd   = Math.abs(variantStartPos - exon.getEnd());
        final int nearestBoundaryDist = Math.min(distToStart, distToEnd);

        // Strict comparison so that the earliest exon in the list wins ties.
        if ( nearestBoundaryDist < smallestDistance ) {
            smallestDistance = nearestBoundaryDist;
            closestIndex = candidateIndex;
        }
        ++candidateIndex;
    }

    return closestIndex;
}

/**
* Get the overlapping exon start/stop as a {@link SimpleInterval} for the given altAllele / reference.
* @param refAllele {@link Allele} for the given {@code altAllele}. Must not be {@code null}.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,16 @@ private GencodeFuncotation createIntronFuncotation(final VariantContext variant,
);
}

// Set our cDNA string:
gencodeFuncotationBuilder.setcDnaChange(
FuncotatorUtils.createIntronicCDnaString(
variant.getStart(),
transcript.getExons(),
strandCorrectedRefAllele.getBaseString(),
strandCorrectedAltAllele.getBaseString()
)
);

// Set our version:
gencodeFuncotationBuilder.setVersion(version);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,16 @@ private static Object[] helpCreateDataForTestGetBasesInWindowAroundReferenceAlle
};
}

/**
 * Build one data-provider row for the {@code createIntronicCDnaString} test:
 * {@code { start, exonList, ref, alt, expectedCDnaString }}.
 * @param start Variant start position passed through to the method under test.
 * @param exonList Exons passed through to the method under test.
 * @param ref Reference allele bases.
 * @param alt Alternate allele bases.
 * @param index Expected 0-based index of the closest exon, or {@code null} when no exon is expected (the expected string is then {@code "NA"}).
 * @param dist Expected signed offset from the closest exon boundary. Only read when {@code index} is not {@code null}, in which case it must not be {@code null}.
 * @return The row as an {@code Object[]}.
 */
private static Object[] helpProvideForCreateIntronicCDnaString(final int start, final List<Locatable> exonList, final String ref, final String alt, final Integer index, final Integer dist) {
    if ( index == null ) {
        return new Object[]{ start, exonList, ref, alt, "NA" };
    }

    // Mirror FuncotatorUtils.createIntronicCDnaString: only strictly negative offsets are rendered with '-',
    // so an offset of zero must be expected as "+0" (not "-0").
    final char sign = dist < 0 ? '-' : '+';
    return new Object[]{ start, exonList, ref, alt, "c.e" + (index+1) + sign + Math.abs(dist) + ref + '>' + alt };
}


//==================================================================================================================
// Data Providers:

Expand Down Expand Up @@ -917,6 +927,69 @@ Object[][] provideDataForRenderProteinChangeString() {
};
}

/**
 * Test rows for {@code getClosestExonIndex}: { variant start, exon list, expected closest-exon index }.
 */
@DataProvider
Object[][] provideForGetClosestExonIndex() {

    // Six non-overlapping exons in ascending coordinate order on a single contig.
    final List<Locatable> exons = new ArrayList<>();
    Collections.addAll(exons,
            new SimpleInterval("testContig", 20, 30),
            new SimpleInterval("testContig", 40, 50),
            new SimpleInterval("testContig", 60, 70),
            new SimpleInterval("testContig", 80, 90),
            new SimpleInterval("testContig", 10000, 15000),
            new SimpleInterval("testContig", 10000000, 15000000)
    );

    return new Object[][] {
            // Empty exon list => no closest exon:
            { 8, Collections.emptyList(), -1 },
            // Variant upstream of the first exon's start:
            { 8, exons, 0 },
            // Variant within the first exon:
            { 25, exons, 0 },
            // Variant immediately past the first exon's end:
            { 31, exons, 0 },
            // Variant equidistant from exon 0's end and exon 1's start (tie goes to the earlier exon):
            { 35, exons, 0 },
            // Variant one base further on, now nearer to exon 1:
            { 36, exons, 1 },
            // Variant in the large gap between exons 3 and 4:
            { 500, exons, 3 },
            // Variant beyond the end of the last exon:
            { 50000000, exons, 5 },
    };
}

/**
 * Test rows for {@code createIntronicCDnaString}, each built by
 * {@code helpProvideForCreateIntronicCDnaString(start, exons, ref, alt, expectedExonIndex, expectedOffset)}.
 */
@DataProvider
Object[][] provideForCreateIntronicCDnaString() {

    // Six non-overlapping exons in ascending coordinate order on a single contig.
    final List<Locatable> exons = new ArrayList<>();
    Collections.addAll(exons,
            new SimpleInterval("testContig", 20, 30),
            new SimpleInterval("testContig", 40, 50),
            new SimpleInterval("testContig", 60, 70),
            new SimpleInterval("testContig", 80, 90),
            new SimpleInterval("testContig", 10000, 15000),
            new SimpleInterval("testContig", 10000000, 15000000)
    );

    return new Object[][] {
            // Empty exon list => "NA":
            helpProvideForCreateIntronicCDnaString( 8, Collections.emptyList(), "A", "C", null, null ),
            // Variant upstream of the first exon's start:
            helpProvideForCreateIntronicCDnaString( 8, exons, "A", "T", 0, -12 ),
            // Variant within the first exon:
            helpProvideForCreateIntronicCDnaString( 25, exons, "C", "T", 0, 5 ),
            // Variant immediately past the first exon's end:
            helpProvideForCreateIntronicCDnaString( 31, exons, "G", "T", 0, 1 ),
            // Variant equidistant from exon 0's end and exon 1's start (tie goes to the earlier exon):
            helpProvideForCreateIntronicCDnaString( 35, exons, "ATG", "GCT", 0, 5 ),
            // Variant one base further on, now nearer to exon 1:
            helpProvideForCreateIntronicCDnaString( 36, exons, "ATG", "GCT", 1, -4 ),
            // Variant in the large gap between exons 3 and 4:
            helpProvideForCreateIntronicCDnaString( 500, exons, "TTTTTTTTT", "AAAAAAAAA", 3, 410 ),
            // Variant beyond the end of the last exon:
            helpProvideForCreateIntronicCDnaString( 50000000, exons, "A", "GTG", 5, 35000000 ),
    };
}

@DataProvider
Object[][] provideForTestGetCodingSequenceChangeString() {

Expand Down Expand Up @@ -1888,6 +1961,22 @@ void testRenderProteinChangeString(final int protChangeStartPos,
Assert.assertEquals( FuncotatorUtils.renderProteinChangeString(seqComp, startCodon), expected );
}

/**
 * Checks that {@code FuncotatorUtils.getClosestExonIndex} returns the expected exon index for each provided row.
 */
@Test(dataProvider = "provideForGetClosestExonIndex")
void testGetClosestExonIndex( final int variantStartPos,
                              final List<? extends Locatable> exonList,
                              final int expected ) {
    final int actualIndex = FuncotatorUtils.getClosestExonIndex(variantStartPos, exonList);
    Assert.assertEquals( actualIndex, expected );
}

/**
 * Checks that {@code FuncotatorUtils.createIntronicCDnaString} renders the expected cDNA string for each provided row.
 */
@Test(dataProvider = "provideForCreateIntronicCDnaString")
void testCreateIntronicCDnaString(final int variantStartPos,
                                  final List<? extends Locatable> exonList,
                                  final String strandCorrectedRefAllele,
                                  final String strandCorrectedAltAllele,
                                  final String expected ) {
    final String actualCDna = FuncotatorUtils.createIntronicCDnaString(
            variantStartPos,
            exonList,
            strandCorrectedRefAllele,
            strandCorrectedAltAllele
    );
    Assert.assertEquals( actualCDna, expected );
}

@Test(dataProvider = "provideForTestGetCodingSequenceChangeString")
void testGetCodingSequenceChangeString(final int codingSequenceAlleleStart,
final String referenceAllele,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@ static List<Object[]> provideIndelDataForMuc16() {

// ==============================
// Test cases for issue 5050:
new Object[]{ "MUC16", 19, 8966646, 8966648, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.DEL, "TTA", "T", "g.chr19:8966647_8966648delTA", "-", null, "c.e81+5", null },
new Object[]{ "MUC16", 19, 8966818, 8966820, GencodeFuncotation.VariantClassification.INTRON, GencodeFuncotation.VariantType.DEL, "TGG", "T", "g.chr19:8966819_8966820delGG", "-", null, null, null },
new Object[]{ "MUC16", 19, 8966646, 8966648, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.DEL, "TTA", "T", "g.chr19:8966647_8966648delTA", "-", "c.e81-4TAA>A", "c.e81+5", null },
new Object[]{ "MUC16", 19, 8966818, 8966820, GencodeFuncotation.VariantClassification.INTRON, GencodeFuncotation.VariantType.DEL, "TGG", "T", "g.chr19:8966819_8966820delGG", "-", "c.e81+2CCA>A", null, null },
new Object[]{ "MUC16", 19, 8966813, 8966815, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.DEL, "AGA", "A", "g.chr19:8966814_8966815delGA", "-", "c.43138_43139delTC", "c.(43138-43140)tctfs", "p.S14380fs" },
new Object[]{ "MUC16", 19, 8966651, 8966653, GencodeFuncotation.VariantClassification.FRAME_SHIFT_DEL, GencodeFuncotation.VariantType.DEL, "ATC", "A", "g.chr19:8966652_8966653delTC", "-", "c.43300_43301delGA", "c.(43300-43302)gatfs", "p.D14434fs" },

new Object[]{ "MUC16", 19, 8966647, 8966647, GencodeFuncotation.VariantClassification.INTRON, GencodeFuncotation.VariantType.INS, "T", "TAG", "g.chr19:8966647_8966648insAG", "-", null, null, null },
new Object[]{ "MUC16", 19, 8966818, 8966818, GencodeFuncotation.VariantClassification.INTRON, GencodeFuncotation.VariantType.INS, "T", "TAG", "g.chr19:8966818_8966819insAG", "-", null, null, null },
new Object[]{ "MUC16", 19, 8966647, 8966647, GencodeFuncotation.VariantClassification.INTRON, GencodeFuncotation.VariantType.INS, "T", "TAG", "g.chr19:8966647_8966648insAG", "-", "c.e81-3A>CTA", null, null },
new Object[]{ "MUC16", 19, 8966818, 8966818, GencodeFuncotation.VariantClassification.INTRON, GencodeFuncotation.VariantType.INS, "T", "TAG", "g.chr19:8966818_8966819insAG", "-", "c.e81+2A>CTA", null, null },
new Object[]{ "MUC16", 19, 8966814, 8966814, GencodeFuncotation.VariantClassification.FRAME_SHIFT_INS, GencodeFuncotation.VariantType.INS, "G", "GTC", "g.chr19:8966814_8966815insTC", "-", "c.43138_43139insGA", "c.(43138-43140)tctfs", "p.S14380fs" },
new Object[]{ "MUC16", 19, 8966651, 8966651, GencodeFuncotation.VariantClassification.FRAME_SHIFT_INS, GencodeFuncotation.VariantType.INS, "A", "ATG", "g.chr19:8966651_8966652insTG", "-", "c.43301_43302insCA", "c.(43300-43302)gatfs", "p.G14435fs" },

new Object[]{ "MUC16", 19, 8966648, 8966648, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.INS, "A", "AC", "g.chr19:8966648_8966649insC", "-", null, "c.e81+2", null },
new Object[]{ "MUC16", 19, 8966817, 8966817, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.INS, "C", "CT", "g.chr19:8966817_8966818insT", "-", null, "c.e81-1", null },
new Object[]{ "MUC16", 19, 8966648, 8966648, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.INS, "A", "AC", "g.chr19:8966648_8966649insC", "-", "c.e81-2T>GT", "c.e81+2", null },
new Object[]{ "MUC16", 19, 8966817, 8966817, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.INS, "C", "CT", "g.chr19:8966817_8966818insT", "-", "c.e81+1G>AG", "c.e81-1", null },
new Object[]{ "MUC16", 19, 8966815, 8966815, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.INS, "A", "AT", "g.chr19:8966815_8966816insT", "-", "c.43137_43138insA", "c.(43138-43140)tctfs", "p.S14380fs" },
new Object[]{ "MUC16", 19, 8966650, 8966650, GencodeFuncotation.VariantClassification.SPLICE_SITE, GencodeFuncotation.VariantType.INS, "C", "CG", "g.chr19:8966650_8966651insG", "-", "c.43302_43303insC", "c.(43303-43305)gggfs", "p.G14435fs" },
// ==============================
Expand Down
Loading