From 4a7cd4d4aced97bdb6df45226d8b59eb00e20bb4 Mon Sep 17 00:00:00 2001 From: Luan Nguyen Date: Wed, 2 Oct 2024 20:04:15 +1000 Subject: [PATCH] Chord: Implemented extractSigsSnv() R function in java --- .../hartwig/hmftools/chord/prep/SnvPrep.java | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 chord/src/main/java/com/hartwig/hmftools/chord/prep/SnvPrep.java diff --git a/chord/src/main/java/com/hartwig/hmftools/chord/prep/SnvPrep.java b/chord/src/main/java/com/hartwig/hmftools/chord/prep/SnvPrep.java new file mode 100644 index 0000000000..d5cb82c448 --- /dev/null +++ b/chord/src/main/java/com/hartwig/hmftools/chord/prep/SnvPrep.java @@ -0,0 +1,123 @@ +package com.hartwig.hmftools.chord.prep; + +import static com.hartwig.hmftools.chord.ChordConstants.CHORD_LOGGER; + +import java.nio.file.NoSuchFileException; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import com.hartwig.hmftools.chord.ChordConfig; +import com.hartwig.hmftools.chord.variant.VcfFile; +import com.hartwig.hmftools.common.sigs.SnvSigUtils; +import com.hartwig.hmftools.common.variant.SageVcfTags; + +import htsjdk.variant.variantcontext.VariantContext; + +public class SnvPrep +{ + private final ChordConfig mConfig; + + public SnvPrep(ChordConfig config) + { + mConfig = config; + } + + private List loadVariants(String sampleId) throws NoSuchFileException + { + VcfFile vcfFile = new VcfFile(mConfig.purpleSomaticVcfFile(sampleId), mConfig.IncludeNonPass); + return vcfFile.loadVariants(); + } + + private static Map countTriNucContexts(List variantContexts) + { + // Get bin names + // Returns: C>A_ACA -> 0, C>A_ACC -> 1, C>A_ACG -> 2, etc. We only need the name (i.e. key) + Map triNucNameIndexMap = new LinkedHashMap<>(); + SnvSigUtils.populateBucketMap(triNucNameIndexMap); + + // Initialize count vector + Map triNucNameCountsMap = new LinkedHashMap<>(); + triNucNameIndexMap.keySet().forEach(i -> triNucNameCountsMap.put(i, 0)); + + // Count trinucleotide contexts + int snvCount = 0; + for(VariantContext variantContext : variantContexts) + { + String refSeq = variantContext.getReference().getBaseString(); + String altSeq = variantContext.getAlternateAllele(0).getDisplayString(); + + boolean isSnv = refSeq.length()==1 && altSeq.length()==1; + if(!isSnv) + continue; + + String triNucSequence = variantContext.getAttributeAsString(SageVcfTags.TRINUCLEOTIDE_CONTEXT, null); + String triNucContext = SnvSigUtils.variantContext(refSeq, altSeq, triNucSequence); + + // CHORD_LOGGER.trace("{}:{}:{}:{} {}", + // variantContext.getContig(), variantContext.getStart(), refSeq, altSeq, renameTriNucBin(triNucContext)); + + triNucNameCountsMap.compute(triNucContext, (k,v) -> v + 1); + + snvCount++; + } + + CHORD_LOGGER.debug("Counted trinucleotide contexts for {} SNVs", snvCount); + + return triNucNameCountsMap; + } + + private static String renameTriNucBin(String bucketName) + { + // Convert e.g. "C>A_ACA" to "A[C>A]A" + // The latter is the format required by the CHORD random forest model + + String[] bucketNameSplit = bucketName.split("_"); + + String substitutionType = bucketNameSplit[0]; + char[] triNucSequence = bucketNameSplit[1].toCharArray(); + + return String.format("%s[%s]%s", triNucSequence[0], substitutionType, triNucSequence[2]); + } + + private static List makeCountsList(Map triNucCounts) + { + List counts = new ArrayList<>(); + + for(String binName : triNucCounts.keySet()) + { + String newBinName = renameTriNucBin(binName); + int count = triNucCounts.get(binName); + + MutTypeCount mutTypeCount = new MutTypeCount(newBinName, count); + + CHORD_LOGGER.trace(mutTypeCount); + + counts.add(new MutTypeCount(newBinName, count)); + } + + return counts; + } + + public List extractSampleData(String sampleId) + { + try + { + CHORD_LOGGER.info("Counting SNV 96 trinucleotide contexts"); + + List variantContexts = loadVariants(sampleId); + Map triNucCountsMap = countTriNucContexts(variantContexts); + List triNucCountsList = makeCountsList(triNucCountsMap); + + return triNucCountsList; + } + catch(Exception e) + { + CHORD_LOGGER.error("sample({}) failed to count SNV trinucleotide contexts:", sampleId); + e.printStackTrace(); + System.exit(1); + return null; + } + } +}