Skip to content

Commit

Permalink
Handle importing GFF3s where exons are implicit (#492)
Browse files Browse the repository at this point in the history
* Start adding test and possibly some fixes for #481

Conversion to GFF3Feature includes source and score and sets ID and
Parent attributes. However, export to GFF from UI fails.

* Fix imports

* Test expanded

* Process CDSs, tests extended

* Temp commit: Need to fix handling of missing refSeq and proper testing

* Add test - need to test with spliced UTRs

* Fix header and incorrect phase (not used by Apollo anyway)

* Handle spliced UTRs

* Fix tests

* Re-enable skipped tests

---------

Co-authored-by: Garrett Stevens <[email protected]>
  • Loading branch information
dariober and garrettjstevens authored Jan 23, 2025
1 parent 8f1c5d7 commit 06695e9
Show file tree
Hide file tree
Showing 10 changed files with 442 additions and 9 deletions.
31 changes: 31 additions & 0 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,37 @@ describe('gff3ToAnnotationFeature examples', () => {
})
})

describe('CDS without exons', () => {
it('Convert mRNA with CDS but without exon', () => {
const [gffFeature] = readFeatureFile('test_data/cds_without_exon.gff')
const actual = gff3ToAnnotationFeature(gffFeature)
const expected = readAnnotationFeatureSnapshot(
'test_data/cds_without_exon.json',
)
compareFeatures(actual, expected)
})
it('Convert mRNA with CDS but without exon and spliced UTR', () => {
const [gffFeature] = readFeatureFile(
'test_data/cds_without_exon_spliced_utr.gff',
)
const actual = gff3ToAnnotationFeature(gffFeature)
const expected = readAnnotationFeatureSnapshot(
'test_data/cds_without_exon_spliced_utr.json',
)
compareFeatures(actual, expected)
})
it('Convert mRNA with one CDS, without exons non-adjacent UTR', () => {
const [gffFeature] = readFeatureFile(
'test_data/onecds_without_exon_spliced_utr.gff',
)
const actual = gff3ToAnnotationFeature(gffFeature)
const expected = readAnnotationFeatureSnapshot(
'test_data/onecds_without_exon_spliced_utr.json',
)
compareFeatures(actual, expected)
})
})

describe('gff3ToAnnotationFeature', () => {
for (const testCase of testCases) {
const [description, featureLine, convertedFeature] = testCase
Expand Down
175 changes: 171 additions & 4 deletions packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,19 @@ function convertChildren(
const { child_features: childFeatures } = firstFeature

const cdsFeatures: GFF3Feature[] = []
const exonFeatures: GFF3Feature[] = []
const utrFeatures: GFF3Feature[] = []
for (const childFeature of childFeatures) {
const [firstChildFeatureLocation] = childFeature
if (firstChildFeatureLocation.type === 'exon') {
exonFeatures.push(childFeature)
}
if (
firstChildFeatureLocation.type === 'three_prime_UTR' ||
firstChildFeatureLocation.type === 'five_prime_UTR'
) {
utrFeatures.push(childFeature)
}
if (
firstChildFeatureLocation.type === 'three_prime_UTR' ||
firstChildFeatureLocation.type === 'five_prime_UTR' ||
Expand All @@ -175,10 +186,23 @@ function convertChildren(
convertedChildren[child._id] = child
}
}
const processedCDS =
cdsFeatures.length > 0 ? processCDS(cdsFeatures, refSeq, featureIds) : []
for (const cds of processedCDS) {
convertedChildren[cds._id] = cds

if (cdsFeatures.length > 0) {
const processedCDS = processCDS(cdsFeatures, refSeq, featureIds)

for (const cds of processedCDS) {
convertedChildren[cds._id] = cds
}

const missingExons = inferMissingExons(
cdsFeatures,
exonFeatures,
utrFeatures,
processedCDS[0].refSeq,
)
for (const exon of missingExons) {
convertedChildren[exon._id] = exon
}
}

if (Object.keys(convertedChildren).length > 0) {
Expand All @@ -187,6 +211,149 @@ function convertChildren(
return
}

function inferMissingExons(
cdsFeatures: GFF3Feature[],
existingExons: GFF3Feature[],
utrFeatures: GFF3Feature[],
refSeq: string,
): AnnotationFeatureSnapshot[] {
// Convert utrFeatures from GFF3Feature to AnnotationFeatureSnapshot
const utrExons: AnnotationFeatureSnapshot[] = []
for (const utrs of utrFeatures) {
for (const utr of utrs) {
if (!utr.start || !utr.end) {
throw new Error(
`UTR has undefined start and/or end\n: ${JSON.stringify(utr, null, 2)}`,
)
}
let strand: 1 | -1 | undefined = undefined
if (utr.strand === '+') {
strand = 1
} else if (utr.strand === '-') {
strand = -1
}
utrExons.push({
_id: new ObjectID().toHexString(),
refSeq,
type: 'exon',
min: utr.start - 1,
max: utr.end,
strand,
})
}
}
utrExons.sort((a, b) => a.min - b.min)

const missingExons: AnnotationFeatureSnapshot[] = []
for (const protein of cdsFeatures) {
protein.sort((a, b) => {
if (!a.start || !b.start) {
throw new Error('CDS has undefined start')
}
return a.start - b.start
})
for (let cdsIdx = 0; cdsIdx < protein.length; cdsIdx++) {
const cds = protein[cdsIdx]
// For CDS check if there is an exon containing it. If not, create an exon with same coords as the CDS.
let exonFound = false
for (const x of existingExons) {
if (x.length != 1) {
throw new Error('Unexpected number of exons')
}
const [exon] = x
if (
exon.start &&
exon.end &&
cds.start &&
cds.end &&
exon.start <= cds.start &&
exon.end >= cds.end
) {
exonFound = true
break
}
}
if (!exonFound) {
if (!cds.start || !cds.end) {
throw new Error(
`CDS has undefined start and/or end: ${JSON.stringify(cds, null, 2)}`,
)
}
let strand: 1 | -1 | undefined = undefined
if (cds.strand === '+') {
strand = 1
} else if (cds.strand === '-') {
strand = -1
}
const newExon: AnnotationFeatureSnapshot = {
_id: new ObjectID().toHexString(),
refSeq,
type: 'exon',
min: cds.start - 1,
max: cds.end,
strand,
}
if (cdsIdx === 0) {
// If this CDS is the leftmost (or the only CDS in this protein), check if we need to add UTRs before it
for (const utr of utrExons) {
if (utr.max > newExon.min) {
break
}
if (utr.max === newExon.min) {
// UTR ends where exon begins: Extend the exon to include this UTR
newExon.min = utr.min
} else {
missingExons.push(utr)
}
}
}
if (cdsIdx === protein.length - 1) {
// If this CDS is the rightmost (or the only CDS in this protein), check if we need to add UTRs after it
for (const utr of utrExons) {
if (utr.min < newExon.max) {
continue
}
if (utr.min === newExon.max) {
// UTR begins where exon end: Extend the exon to include this UTR
newExon.max = utr.max
} else {
missingExons.push(utr)
}
}
}
missingExons.push(newExon)
}
}
}
const mergedExons = mergeAnnotationFeatures(missingExons)
return mergedExons
}

function mergeAnnotationFeatures(
features: AnnotationFeatureSnapshot[],
): AnnotationFeatureSnapshot[] {
if (features.length === 0) {
return []
}
features.sort((a, b) => a.min - b.min)

const res = []
res.push(features[0])

for (let i = 1; i < features.length; i++) {
const last = res.at(-1)
const curr = features[i]

// If current interval overlaps with the last merged interval, merge them
if (last && curr.min <= last.max) {
last.max = Math.max(last.max, curr.max)
} else {
res.push(curr)
}
}
return res
}

/**
* If a GFF3 file has CDS features that either (1) don't have an ID or (2) have
* different IDs for each CDS, we have to do a bit of guessing about how they
Expand Down
12 changes: 12 additions & 0 deletions packages/apollo-shared/test_data/cds_without_exon.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##gff-version 3
##sequence-region 53cb6b9b4f4ddef1ad47f943 1050 9000
ctgA example gene 1050 9000 . + . ID=eden
ctgA example mRNA 1050 9000 . + . ID=eden.1;Parent=eden
ctgA example five_prime_UTR 1050 1210 . + 0 ID=five1;Parent=eden.1
ctgA example CDS 1211 1510 . + 0 ID=cds2;Parent=eden.1
ctgA example CDS 1611 1710 . + 0 ID=cds2;Parent=eden.1
ctgA example three_prime_UTR 1711 1800 . + 0 ID=three1;Parent=eden.1
ctgA example exon 1201 1500 . + 0 ID=exon1;Parent=eden.1
ctgA example CDS 1601 1700 . + 0 ID=cds1;Parent=eden.1
ctgA example CDS 1201 1500 . + 0 ID=cds1;Parent=eden.1
ctgA example TF_binding_site 1050 1100 . + . Parent=eden
74 changes: 74 additions & 0 deletions packages/apollo-shared/test_data/cds_without_exon.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
{
"_id": "67581b7d5890a8eb1bedab6e",
"refSeq": "ctgA",
"type": "gene",
"min": 1049,
"max": 9000,
"strand": 1,
"children": {
"67581b7d5890a8eb1bedab6c": {
"_id": "67581b7d5890a8eb1bedab6c",
"refSeq": "ctgA",
"type": "mRNA",
"min": 1049,
"max": 9000,
"strand": 1,
"children": {
"67581b7d5890a8eb1bedab66": {
"_id": "67581b7d5890a8eb1bedab66",
"refSeq": "ctgA",
"type": "exon",
"min": 1200,
"max": 1500,
"strand": 1,
"attributes": { "gff_source": ["example"], "gff_id": ["exon1"] }
},
"67581b7d5890a8eb1bedab67": {
"_id": "67581b7d5890a8eb1bedab67",
"refSeq": "ctgA",
"type": "CDS",
"min": 1210,
"max": 1710,
"strand": 1,
"attributes": { "gff_source": ["example"], "gff_id": ["cds2"] }
},
"67581b7d5890a8eb1bedab68": {
"_id": "67581b7d5890a8eb1bedab68",
"refSeq": "ctgA",
"type": "CDS",
"min": 1200,
"max": 1700,
"strand": 1,
"attributes": { "gff_source": ["example"], "gff_id": ["cds1"] }
},
"67581b7d5890a8eb1bedab69": {
"_id": "67581b7d5890a8eb1bedab69",
"refSeq": "ctgA",
"type": "exon",
"min": 1049,
"max": 1510,
"strand": 1
},
"67581b7d5890a8eb1bedab6b": {
"_id": "67581b7d5890a8eb1bedab6b",
"refSeq": "ctgA",
"type": "exon",
"min": 1600,
"max": 1800,
"strand": 1
}
},
"attributes": { "gff_source": ["example"], "gff_id": ["eden.1"] }
},
"67581b7d5890a8eb1bedab6d": {
"_id": "67581b7d5890a8eb1bedab6d",
"refSeq": "ctgA",
"type": "TF_binding_site",
"min": 1049,
"max": 1100,
"strand": 1,
"attributes": { "gff_source": ["example"] }
}
},
"attributes": { "gff_source": ["example"], "gff_id": ["eden"] }
}
14 changes: 14 additions & 0 deletions packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
##gff-version 3
chr1 . gene 1000 9000 . + . ID=gene00001
#chr1 . mRNA 1300 9000 . + . ID=mRNA00001;Parent=gene00001
#chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00001
#chr1 . CDS 5000 5500 . + 1 ID=cds00001;Parent=mRNA00001
#chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00001
chr1 . mRNA 1300 9000 . + . ID=mRNA00003;Parent=gene00001
chr1 . five_prime_UTR 1300 1500 . + . Parent=mRNA00003
chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00003
chr1 . CDS 3301 3902 . + 0 ID=cds00003;Parent=mRNA00003
chr1 . CDS 5000 5500 . + 1 ID=cds00003;Parent=mRNA00003
chr1 . CDS 7000 7600 . + 2 ID=cds00003;Parent=mRNA00003
chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00003
chr1 . three_prime_UTR 8501 8900 . + . Parent=mRNA00003
Loading

0 comments on commit 06695e9

Please sign in to comment.