Skip to content

Raredisease add QC check for contamination and adapters #4183

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cg/constants/nf_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,14 @@ class NfTowerStatus(StrEnum):
"median_coverage": {"norm": "gt", "threshold": 25},
}

# Multiplier applied to (sample reads * read length) to derive the per-sample
# adapter threshold in cg/meta/workflow/raredisease.py.
# NOTE(review): despite "PERCENTAGE" in the name, the value is used directly as
# a multiplier (no division by 100), so it presumably means 0.5 % — confirm.
RAREDISEASE_ADAPTER_BASES_PERCENTAGE_THRESHOLD = 0.005

# Name of the metric carrying the predicted sex; the same string is used as a
# metric-conditions key when the order sex is set for a sample.
RAREDISEASE_PREDICTED_SEX_METRIC = "predicted_sex_sex_check"

RAREDISEASE_METRIC_CONDITIONS_WES: dict[str, dict[str, Any]] = {
"percent_duplicates": {"norm": "lt", "threshold": 20},
"Contamination Status": {"norm": "eq", "threshold": "NO"},
"adapter_cutting_adapter_trimmed_reads": {"norm": "lt", "threshold": None},
"PCT_PF_UQ_READS_ALIGNED": {"norm": "gt", "threshold": 0.95},
"MEDIAN_TARGET_COVERAGE": {"norm": "gt", "threshold": 25},
"PCT_TARGET_BASES_10X": {"norm": "gt", "threshold": 0.95},
Expand All @@ -35,6 +39,8 @@ class NfTowerStatus(StrEnum):

RAREDISEASE_METRIC_CONDITIONS_WGS: dict[str, dict[str, Any]] = {
"percent_duplicates": {"norm": "lt", "threshold": 20},
"Contamination Status": {"norm": "eq", "threshold": "NO"},
"adapter_cutting_adapter_trimmed_reads": {"norm": "lt", "threshold": None},
"PCT_PF_UQ_READS_ALIGNED": {"norm": "gt", "threshold": 0.95},
"MEDIAN_TARGET_COVERAGE": {"norm": "gt", "threshold": 25},
"PCT_TARGET_BASES_10X": {"norm": "gt", "threshold": 0.95},
Expand Down
2 changes: 2 additions & 0 deletions cg/constants/sequencing.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@ class SeqLibraryPrepCategory(StrEnum):
SeqLibraryPrepCategory.TARGETED_GENOME_SEQUENCING,
SeqLibraryPrepCategory.WHOLE_EXOME_SEQUENCING,
]

# Read length used to turn a sample's read count into a base count when the
# Raredisease adapter threshold is calculated.
# NOTE(review): presumably in bases per read and fixed for all samples — confirm
# that samples sequenced with a different read length are not affected.
NOVASEQ_SEQUENCING_READ_LENGTH = 151
16 changes: 15 additions & 1 deletion cg/meta/workflow/raredisease.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@
RAREDISEASE_PARENT_PEDDY_METRIC_CONDITION,
RAREDISEASE_METRIC_CONDITIONS_WGS,
RAREDISEASE_METRIC_CONDITIONS_WES,
RAREDISEASE_ADAPTER_BASES_PERCENTAGE_THRESHOLD,
)
from cg.constants.scout import RAREDISEASE_CASE_TAGS, ScoutExportFileName
from cg.constants.sequencing import SeqLibraryPrepCategory
from cg.constants.sequencing import SeqLibraryPrepCategory, NOVASEQ_SEQUENCING_READ_LENGTH
from cg.constants.subject import PlinkPhenotypeStatus, PlinkSex
from cg.constants.tb import AnalysisType
from cg.meta.workflow.nf_analysis import NfAnalysisAPI
Expand Down Expand Up @@ -176,6 +177,7 @@ def get_workflow_metrics(self, sample_id: str) -> dict:
self.get_metric_conditions_by_prep_category(sample_id=sample.internal_id)
)
self.set_order_sex_for_sample(sample, metric_conditions)
self.set_adapter_bases_for_sample(sample, metric_conditions)
else:
metric_conditions = RAREDISEASE_PARENT_PEDDY_METRIC_CONDITION.copy()
return metric_conditions
Expand Down Expand Up @@ -243,6 +245,18 @@ def set_order_sex_for_sample(sample: Sample, metric_conditions: dict) -> None:
metric_conditions["predicted_sex_sex_check"]["threshold"] = sample.sex
metric_conditions["gender"]["threshold"] = sample.sex

@staticmethod
def set_adapter_bases_for_sample(sample: Sample, metric_conditions: dict) -> None:
    """Set the sample-specific adapter threshold in the metric conditions.

    The threshold is the sample's read count multiplied by the NovaSeq read
    length and by the allowed adapter share. The result is written in place
    to metric_conditions["adapter_cutting_adapter_trimmed_reads"]["threshold"];
    nothing is returned.
    """
    # Same left-to-right multiplication order as before: (reads * length) * share.
    sequenced_bases = sample.reads * NOVASEQ_SEQUENCING_READ_LENGTH
    max_adapter_bases = sequenced_bases * RAREDISEASE_ADAPTER_BASES_PERCENTAGE_THRESHOLD

    # NOTE(review): the value computed above is a number of bases, while the
    # metric key it is stored under refers to trimmed *reads* — confirm that
    # this is the intended metric.
    adapter_condition = metric_conditions["adapter_cutting_adapter_trimmed_reads"]
    adapter_condition["threshold"] = max_adapter_bases

def get_sample_coverage_file_path(self, bundle_name: str, sample_id: str) -> str | None:
"""Return the Raredisease d4 coverage file path."""
coverage_file_tags: list[str] = RAREDISEASE_COVERAGE_FILE_TAGS + [sample_id]
Expand Down
70 changes: 68 additions & 2 deletions tests/fixtures/analysis/raredisease/multiqc_data.json
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,42 @@
"het_ratio_sex_check": 0.03753,
"predicted_sex_sex_check": "female",
"error_sex_check": "True"
},
"ADM2_1": {
}
},
{
"ADM1": {
"filtering_result_passed_filter_reads": 104595780.0,
"filtering_result_corrected_reads": 1692428.0,
"filtering_result_corrected_bases": 3103038.0,
"filtering_result_low_quality_reads": 3135572.0,
"filtering_result_too_many_N_reads": 37730.0,
"filtering_result_too_short_reads": 153994.0,
"filtering_result_too_long_reads": 0.0,
"pct_duplication": 2.92768,
"after_filtering_total_reads": 104595780.0,
"after_filtering_total_bases": 15746394068.0,
"after_filtering_q20_bases": 15488662257.0,
"after_filtering_q30_bases": 14991083442.0,
"after_filtering_q20_rate": 0.983632,
"after_filtering_q30_rate": 0.952033,
"after_filtering_read1_mean_length": 150.0,
"after_filtering_read2_mean_length": 150.0,
"after_filtering_gc_content": 0.409313,
"before_filtering_total_reads": 107923076.0,
"pct_surviving": 96.91697445688075,
"adapter_cutting_adapter_trimmed_reads": 1365798.0,
"adapter_cutting_adapter_trimmed_bases": 65582892.0,
"pct_adapter": 1.265529162641732
}
},
{
"ADM1": {
"Contamination Status": "NO",
"Contamination Level": "ND"
}
},
{
"ADM2_1": {
"percent_gc": 40.0,
"avg_sequence_length": 151.0,
"median_sequence_length": 151,
Expand Down Expand Up @@ -420,6 +454,38 @@
"predicted_sex_sex_check": "female",
"error_sex_check": "True"
}
},
{
"ADM2": {
"filtering_result_passed_filter_reads": 104595780.0,
"filtering_result_corrected_reads": 1692428.0,
"filtering_result_corrected_bases": 3103038.0,
"filtering_result_low_quality_reads": 3135572.0,
"filtering_result_too_many_N_reads": 37730.0,
"filtering_result_too_short_reads": 153994.0,
"filtering_result_too_long_reads": 0.0,
"pct_duplication": 2.92768,
"after_filtering_total_reads": 104595780.0,
"after_filtering_total_bases": 15746394068.0,
"after_filtering_q20_bases": 15488662257.0,
"after_filtering_q30_bases": 14991083442.0,
"after_filtering_q20_rate": 0.983632,
"after_filtering_q30_rate": 0.952033,
"after_filtering_read1_mean_length": 150.0,
"after_filtering_read2_mean_length": 150.0,
"after_filtering_gc_content": 0.409313,
"before_filtering_total_reads": 107923076.0,
"pct_surviving": 96.91697445688075,
"adapter_cutting_adapter_trimmed_reads": 1365798.0,
"adapter_cutting_adapter_trimmed_bases": 65582892.0,
"pct_adapter": 1.265529162641732
}
},
{
"ADM2": {
"Contamination Status": "NO",
"Contamination Level": "ND"
}
}
],
"report_saved_raw_data": {
Expand Down
Loading
Loading