Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle importing GFF3s where exons are implicit #492

Merged
merged 11 commits into from
Jan 23, 2025
31 changes: 31 additions & 0 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,37 @@ describe('gff3ToAnnotationFeature examples', () => {
})
})

describe('CDS without exons', () => {
it('Convert mRNA with CDS but without exon', () => {
const [gffFeature] = readFeatureFile('test_data/cds_without_exon.gff')
const actual = gff3ToAnnotationFeature(gffFeature)
const expected = readAnnotationFeatureSnapshot(
'test_data/cds_without_exon.json',
)
compareFeatures(actual, expected)
})
it('Convert mRNA with CDS but without exon and spliced UTR', () => {
const [gffFeature] = readFeatureFile(
'test_data/cds_without_exon_spliced_utr.gff',
)
const actual = gff3ToAnnotationFeature(gffFeature)
const expected = readAnnotationFeatureSnapshot(
'test_data/cds_without_exon_spliced_utr.json',
)
compareFeatures(actual, expected)
})
it('Convert mRNA with one CDS, without exons non-adjacent UTR', () => {
const [gffFeature] = readFeatureFile(
'test_data/onecds_without_exon_spliced_utr.gff',
)
const actual = gff3ToAnnotationFeature(gffFeature)
const expected = readAnnotationFeatureSnapshot(
'test_data/onecds_without_exon_spliced_utr.json',
)
compareFeatures(actual, expected)
})
})

describe('gff3ToAnnotationFeature', () => {
for (const testCase of testCases) {
const [description, featureLine, convertedFeature] = testCase
Expand Down
175 changes: 171 additions & 4 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,19 @@ function convertChildren(
const { child_features: childFeatures } = firstFeature

const cdsFeatures: GFF3Feature[] = []
const exonFeatures: GFF3Feature[] = []
const utrFeatures: GFF3Feature[] = []
for (const childFeature of childFeatures) {
const [firstChildFeatureLocation] = childFeature
if (firstChildFeatureLocation.type === 'exon') {
exonFeatures.push(childFeature)
}
if (
firstChildFeatureLocation.type === 'three_prime_UTR' ||
firstChildFeatureLocation.type === 'five_prime_UTR'
) {
utrFeatures.push(childFeature)
}
if (
firstChildFeatureLocation.type === 'three_prime_UTR' ||
firstChildFeatureLocation.type === 'five_prime_UTR' ||
Expand All @@ -175,10 +186,23 @@ function convertChildren(
convertedChildren[child._id] = child
}
}
const processedCDS =
cdsFeatures.length > 0 ? processCDS(cdsFeatures, refSeq, featureIds) : []
for (const cds of processedCDS) {
convertedChildren[cds._id] = cds

if (cdsFeatures.length > 0) {
const processedCDS = processCDS(cdsFeatures, refSeq, featureIds)

for (const cds of processedCDS) {
convertedChildren[cds._id] = cds
}

const missingExons = inferMissingExons(
cdsFeatures,
exonFeatures,
utrFeatures,
processedCDS[0].refSeq,
)
for (const exon of missingExons) {
convertedChildren[exon._id] = exon
}
}

if (Object.keys(convertedChildren).length > 0) {
Expand All @@ -187,6 +211,149 @@ function convertChildren(
return
}

function inferMissingExons(
cdsFeatures: GFF3Feature[],
existingExons: GFF3Feature[],
utrFeatures: GFF3Feature[],
refSeq: string,
): AnnotationFeatureSnapshot[] {
// Convert utrFeatures from GFF3Feature to AnnotationFeatureSnapshot
const utrExons: AnnotationFeatureSnapshot[] = []
for (const utrs of utrFeatures) {
for (const utr of utrs) {
if (!utr.start || !utr.end) {
throw new Error(
`UTR has undefined start and/or end\n: ${JSON.stringify(utr, null, 2)}`,
)
}
let strand: 1 | -1 | undefined = undefined
if (utr.strand === '+') {
strand = 1
} else if (utr.strand === '-') {
strand = -1
}
utrExons.push({
_id: new ObjectID().toHexString(),
refSeq,
type: 'exon',
min: utr.start - 1,
max: utr.end,
strand,
})
}
}
utrExons.sort((a, b) => a.min - b.min)

const missingExons: AnnotationFeatureSnapshot[] = []
for (const protein of cdsFeatures) {
protein.sort((a, b) => {
if (!a.start || !b.start) {
throw new Error('CDS has undefined start')
}
return a.start - b.start
})
for (let cdsIdx = 0; cdsIdx < protein.length; cdsIdx++) {
const cds = protein[cdsIdx]
// For CDS check if there is an exon containing it. If not, create an exon with same coords as the CDS.
let exonFound = false
for (const x of existingExons) {
if (x.length != 1) {
throw new Error('Unexpected number of exons')
}
const [exon] = x
if (
exon.start &&
exon.end &&
cds.start &&
cds.end &&
exon.start <= cds.start &&
exon.end >= cds.end
) {
exonFound = true
break
}
}
if (!exonFound) {
if (!cds.start || !cds.end) {
throw new Error(
`CDS has undefined start and/or end: ${JSON.stringify(cds, null, 2)}`,
)
}
let strand: 1 | -1 | undefined = undefined
if (cds.strand === '+') {
strand = 1
} else if (cds.strand === '-') {
strand = -1
}
const newExon: AnnotationFeatureSnapshot = {
_id: new ObjectID().toHexString(),
refSeq,
type: 'exon',
min: cds.start - 1,
max: cds.end,
strand,
}
if (cdsIdx === 0) {
// If this CDS is the leftmost (or the only CDS in this protein), check if we need to add UTRs before it
for (const utr of utrExons) {
if (utr.max > newExon.min) {
break
}
if (utr.max === newExon.min) {
// UTR ends where exon begins: Extend the exon to include this UTR
newExon.min = utr.min
} else {
missingExons.push(utr)
}
}
}
if (cdsIdx === protein.length - 1) {
// If this CDS is the rightmost (or the only CDS in this protein), check if we need to add UTRs after it
for (const utr of utrExons) {
if (utr.min < newExon.max) {
continue
}
if (utr.min === newExon.max) {
// UTR begins where exon end: Extend the exon to include this UTR
newExon.max = utr.max
} else {
missingExons.push(utr)
}
}
}
missingExons.push(newExon)
}
}
}
const mergedExons = mergeAnnotationFeatures(missingExons)
return mergedExons
}

function mergeAnnotationFeatures(
features: AnnotationFeatureSnapshot[],
): AnnotationFeatureSnapshot[] {
if (features.length === 0) {
return []
}
features.sort((a, b) => a.min - b.min)

const res = []
res.push(features[0])

for (let i = 1; i < features.length; i++) {
const last = res.at(-1)
const curr = features[i]

// If current interval overlaps with the last merged interval, merge them
if (last && curr.min <= last.max) {
last.max = Math.max(last.max, curr.max)
} else {
res.push(curr)
}
}
return res
}

/**
* If a GFF3 file has CDS features that either (1) don't have an ID or (2) have
* different IDs for each CDS, we have to do a bit of guessing about how they
Expand Down
12 changes: 12 additions & 0 deletions packages/apollo-shared/test_data/cds_without_exon.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##gff-version 3
##sequence-region 53cb6b9b4f4ddef1ad47f943 1050 9000
ctgA example gene 1050 9000 . + . ID=eden
ctgA example mRNA 1050 9000 . + . ID=eden.1;Parent=eden
ctgA example five_prime_UTR 1050 1210 . + 0 ID=five1;Parent=eden.1
ctgA example CDS 1211 1510 . + 0 ID=cds2;Parent=eden.1
ctgA example CDS 1611 1710 . + 0 ID=cds2;Parent=eden.1
ctgA example three_prime_UTR 1711 1800 . + 0 ID=three1;Parent=eden.1
ctgA example exon 1201 1500 . + 0 ID=exon1;Parent=eden.1
ctgA example CDS 1601 1700 . + 0 ID=cds1;Parent=eden.1
ctgA example CDS 1201 1500 . + 0 ID=cds1;Parent=eden.1
ctgA example TF_binding_site 1050 1100 . + . Parent=eden
74 changes: 74 additions & 0 deletions packages/apollo-shared/test_data/cds_without_exon.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"_id": "67581b7d5890a8eb1bedab6e",
"refSeq": "ctgA",
"type": "gene",
"min": 1049,
"max": 9000,
"strand": 1,
"children": {
"67581b7d5890a8eb1bedab6c": {
"_id": "67581b7d5890a8eb1bedab6c",
"refSeq": "ctgA",
"type": "mRNA",
"min": 1049,
"max": 9000,
"strand": 1,
"children": {
"67581b7d5890a8eb1bedab66": {
"_id": "67581b7d5890a8eb1bedab66",
"refSeq": "ctgA",
"type": "exon",
"min": 1200,
"max": 1500,
"strand": 1,
"attributes": { "gff_source": ["example"], "gff_id": ["exon1"] }
},
"67581b7d5890a8eb1bedab67": {
"_id": "67581b7d5890a8eb1bedab67",
"refSeq": "ctgA",
"type": "CDS",
"min": 1210,
"max": 1710,
"strand": 1,
"attributes": { "gff_source": ["example"], "gff_id": ["cds2"] }
},
"67581b7d5890a8eb1bedab68": {
"_id": "67581b7d5890a8eb1bedab68",
"refSeq": "ctgA",
"type": "CDS",
"min": 1200,
"max": 1700,
"strand": 1,
"attributes": { "gff_source": ["example"], "gff_id": ["cds1"] }
},
"67581b7d5890a8eb1bedab69": {
"_id": "67581b7d5890a8eb1bedab69",
"refSeq": "ctgA",
"type": "exon",
"min": 1049,
"max": 1510,
"strand": 1
},
"67581b7d5890a8eb1bedab6b": {
"_id": "67581b7d5890a8eb1bedab6b",
"refSeq": "ctgA",
"type": "exon",
"min": 1600,
"max": 1800,
"strand": 1
}
},
"attributes": { "gff_source": ["example"], "gff_id": ["eden.1"] }
},
"67581b7d5890a8eb1bedab6d": {
"_id": "67581b7d5890a8eb1bedab6d",
"refSeq": "ctgA",
"type": "TF_binding_site",
"min": 1049,
"max": 1100,
"strand": 1,
"attributes": { "gff_source": ["example"] }
}
},
"attributes": { "gff_source": ["example"], "gff_id": ["eden"] }
}
14 changes: 14 additions & 0 deletions packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
##gff-version 3
chr1 . gene 1000 9000 . + . ID=gene00001
#chr1 . mRNA 1300 9000 . + . ID=mRNA00001;Parent=gene00001
#chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00001
#chr1 . CDS 5000 5500 . + 1 ID=cds00001;Parent=mRNA00001
#chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00001
chr1 . mRNA 1300 9000 . + . ID=mRNA00003;Parent=gene00001
chr1 . five_prime_UTR 1300 1500 . + . Parent=mRNA00003
chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00003
chr1 . CDS 3301 3902 . + 0 ID=cds00003;Parent=mRNA00003
chr1 . CDS 5000 5500 . + 1 ID=cds00003;Parent=mRNA00003
chr1 . CDS 7000 7600 . + 2 ID=cds00003;Parent=mRNA00003
chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00003
chr1 . three_prime_UTR 8501 8900 . + . Parent=mRNA00003
Loading
Loading