From 06695e9dfeb84617a9f89528ab8eec6ab5cc6215 Mon Sep 17 00:00:00 2001 From: Dario Beraldi Date: Thu, 23 Jan 2025 04:08:04 +0000 Subject: [PATCH] Handle importing GFF3s where exons are implicit (#492) * Start adding test and possibly some fixes for #481 Conversion to GFF3Feature includes source and score and sets ID and Parent attributes. However, export to GFF from UI fails. * Fix imports * Test expanded * Process CDSs, tests extended * Temp commit: Need to fix handling of missing refSeq and proper testing * Add test - need to test with spliced UTRs * Fix header and incorrect phase (not used by Apollo anyway) * Handle spliced UTRs * Fix tests * Re-enable skipped tests --------- Co-authored-by: Garrett Stevens --- .../src/GFF3/gff3ToAnnotationFeature.test.ts | 31 ++++ .../src/GFF3/gff3ToAnnotationFeature.ts | 175 +++++++++++++++++- .../test_data/cds_without_exon.gff | 12 ++ .../test_data/cds_without_exon.json | 74 ++++++++ .../cds_without_exon_spliced_utr.gff | 14 ++ .../cds_without_exon_spliced_utr.json | 73 ++++++++ .../test_data/gene_representations.gff3 | 9 +- .../onecds_without_exon_spliced_utr.gff | 6 + .../onecds_without_exon_spliced_utr.json | 55 ++++++ .../cypress/e2e/editFeature.cy.ts | 2 +- 10 files changed, 442 insertions(+), 9 deletions(-) create mode 100644 packages/apollo-shared/test_data/cds_without_exon.gff create mode 100644 packages/apollo-shared/test_data/cds_without_exon.json create mode 100644 packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff create mode 100644 packages/apollo-shared/test_data/cds_without_exon_spliced_utr.json create mode 100644 packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.gff create mode 100644 packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.json diff --git a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts index 9fa2682ab..a98ad905b 100644 --- a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts +++ b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts @@ -197,6 +197,37 @@ describe('gff3ToAnnotationFeature examples', () => { }) }) +describe('CDS without exons', () => { + it('Convert mRNA with CDS but without exon', () => { + const [gffFeature] = readFeatureFile('test_data/cds_without_exon.gff') + const actual = gff3ToAnnotationFeature(gffFeature) + const expected = readAnnotationFeatureSnapshot( + 'test_data/cds_without_exon.json', + ) + compareFeatures(actual, expected) + }) + it('Convert mRNA with CDS but without exon and spliced UTR', () => { + const [gffFeature] = readFeatureFile( + 'test_data/cds_without_exon_spliced_utr.gff', + ) + const actual = gff3ToAnnotationFeature(gffFeature) + const expected = readAnnotationFeatureSnapshot( + 'test_data/cds_without_exon_spliced_utr.json', + ) + compareFeatures(actual, expected) + }) + it('Convert mRNA with one CDS, without exons non-adjacent UTR', () => { + const [gffFeature] = readFeatureFile( + 'test_data/onecds_without_exon_spliced_utr.gff', + ) + const actual = gff3ToAnnotationFeature(gffFeature) + const expected = readAnnotationFeatureSnapshot( + 'test_data/onecds_without_exon_spliced_utr.json', + ) + compareFeatures(actual, expected) + }) +}) + describe('gff3ToAnnotationFeature', () => { for (const testCase of testCases) { const [description, featureLine, convertedFeature] = testCase diff --git a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts index e5b1a45ce..69b41fc4c 100644 --- a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts +++ b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts @@ -157,8 +157,19 @@ function convertChildren( const { child_features: childFeatures } = firstFeature const cdsFeatures: GFF3Feature[] = [] + const exonFeatures: GFF3Feature[] = [] + const utrFeatures: GFF3Feature[] = [] for (const childFeature of childFeatures) { const [firstChildFeatureLocation] = childFeature + if (firstChildFeatureLocation.type === 'exon') { + exonFeatures.push(childFeature) + } + if ( + firstChildFeatureLocation.type === 'three_prime_UTR' || + firstChildFeatureLocation.type === 'five_prime_UTR' + ) { + utrFeatures.push(childFeature) + } if ( firstChildFeatureLocation.type === 'three_prime_UTR' || firstChildFeatureLocation.type === 'five_prime_UTR' || @@ -175,10 +186,23 @@ function convertChildren( convertedChildren[child._id] = child } } - const processedCDS = - cdsFeatures.length > 0 ? processCDS(cdsFeatures, refSeq, featureIds) : [] - for (const cds of processedCDS) { - convertedChildren[cds._id] = cds + + if (cdsFeatures.length > 0) { + const processedCDS = processCDS(cdsFeatures, refSeq, featureIds) + + for (const cds of processedCDS) { + convertedChildren[cds._id] = cds + } + + const missingExons = inferMissingExons( + cdsFeatures, + exonFeatures, + utrFeatures, + processedCDS[0].refSeq, + ) + for (const exon of missingExons) { + convertedChildren[exon._id] = exon + } } if (Object.keys(convertedChildren).length > 0) { @@ -187,6 +211,149 @@ function convertChildren( return } +function inferMissingExons( + cdsFeatures: GFF3Feature[], + existingExons: GFF3Feature[], + utrFeatures: GFF3Feature[], + refSeq: string, +): AnnotationFeatureSnapshot[] { + // Convert utrFeatures from GFF3Feature to AnnotationFeatureSnapshot + const utrExons: AnnotationFeatureSnapshot[] = [] + for (const utrs of utrFeatures) { + for (const utr of utrs) { + if (!utr.start || !utr.end) { + throw new Error( + `UTR has undefined start and/or end\n: ${JSON.stringify(utr, null, 2)}`, + ) + } + let strand: 1 | -1 | undefined = undefined + if (utr.strand === '+') { + strand = 1 + } else if (utr.strand === '-') { + strand = -1 + } + utrExons.push({ + _id: new ObjectID().toHexString(), + refSeq, + type: 'exon', + min: utr.start - 1, + max: utr.end, + strand, + }) + } + } + utrExons.sort((a, b) => a.min - b.min) + + const missingExons: AnnotationFeatureSnapshot[] = [] + for (const protein of cdsFeatures) { + protein.sort((a, b) => { + if (!a.start || !b.start) { + throw new Error('CDS has undefined start') + } + return a.start - b.start + }) + for (let cdsIdx = 0; cdsIdx < protein.length; cdsIdx++) { + const cds = protein[cdsIdx] + // For CDS check if there is an exon containing it. If not, create an exon with same coords as the CDS. + let exonFound = false + for (const x of existingExons) { + if (x.length != 1) { + throw new Error('Unexpected number of exons') + } + const [exon] = x + if ( + exon.start && + exon.end && + cds.start && + cds.end && + exon.start <= cds.start && + exon.end >= cds.end + ) { + exonFound = true + break + } + } + if (!exonFound) { + if (!cds.start || !cds.end) { + throw new Error( + `CDS has undefined start and/or end: ${JSON.stringify(cds, null, 2)}`, + ) + } + let strand: 1 | -1 | undefined = undefined + if (cds.strand === '+') { + strand = 1 + } else if (cds.strand === '-') { + strand = -1 + } + const newExon: AnnotationFeatureSnapshot = { + _id: new ObjectID().toHexString(), + refSeq, + type: 'exon', + min: cds.start - 1, + max: cds.end, + strand, + } + if (cdsIdx === 0) { + // If this CDS is the leftmost (or the only CDS in this protein), check if we need to add UTRs before it + for (const utr of utrExons) { + if (utr.max > newExon.min) { + break + } + if (utr.max === newExon.min) { + // UTR ends where exon begins: Extend the exon to include this UTR + newExon.min = utr.min + } else { + missingExons.push(utr) + } + } + } + if (cdsIdx === protein.length - 1) { + // If this CDS is the rightmost (or the only CDS in this protein), check if we need to add UTRs after it + for (const utr of utrExons) { + if (utr.min < newExon.max) { + continue + } + if (utr.min === newExon.max) { + // UTR begins where exon end: Extend the exon to include this UTR + newExon.max = utr.max + } else { + missingExons.push(utr) + } + } + } + missingExons.push(newExon) + } + } + } + const mergedExons = mergeAnnotationFeatures(missingExons) + return mergedExons +} + +function mergeAnnotationFeatures( + features: AnnotationFeatureSnapshot[], +): AnnotationFeatureSnapshot[] { + if (features.length === 0) { + return [] + } + features.sort((a, b) => a.min - b.min) + + const res = [] + res.push(features[0]) + + for (let i = 1; i < features.length; i++) { + const last = res.at(-1) + const curr = features[i] + + // If current interval overlaps with the last merged interval, merge them + if (last && curr.min <= last.max) { + last.max = Math.max(last.max, curr.max) + } else { + res.push(curr) + } + } + return res +} + /** * If a GFF3 file has CDS features that either (1) don't have an ID or (2) have * different IDs for each CDS, we have to do a bit of guessing about how they diff --git a/packages/apollo-shared/test_data/cds_without_exon.gff b/packages/apollo-shared/test_data/cds_without_exon.gff new file mode 100644 index 000000000..49a26b0b0 --- /dev/null +++ b/packages/apollo-shared/test_data/cds_without_exon.gff @@ -0,0 +1,12 @@ +##gff-version 3 +##sequence-region 53cb6b9b4f4ddef1ad47f943 1050 9000 +ctgA example gene 1050 9000 . + . ID=eden +ctgA example mRNA 1050 9000 . + . ID=eden.1;Parent=eden +ctgA example five_prime_UTR 1050 1210 . + 0 ID=five1;Parent=eden.1 +ctgA example CDS 1211 1510 . + 0 ID=cds2;Parent=eden.1 +ctgA example CDS 1611 1710 . + 0 ID=cds2;Parent=eden.1 +ctgA example three_prime_UTR 1711 1800 . + 0 ID=three1;Parent=eden.1 +ctgA example exon 1201 1500 . + 0 ID=exon1;Parent=eden.1 +ctgA example CDS 1601 1700 . + 0 ID=cds1;Parent=eden.1 +ctgA example CDS 1201 1500 . + 0 ID=cds1;Parent=eden.1 +ctgA example TF_binding_site 1050 1100 . + . Parent=eden diff --git a/packages/apollo-shared/test_data/cds_without_exon.json b/packages/apollo-shared/test_data/cds_without_exon.json new file mode 100644 index 000000000..849544ef8 --- /dev/null +++ b/packages/apollo-shared/test_data/cds_without_exon.json @@ -0,0 +1,74 @@ +{ + "_id": "67581b7d5890a8eb1bedab6e", + "refSeq": "ctgA", + "type": "gene", + "min": 1049, + "max": 9000, + "strand": 1, + "children": { + "67581b7d5890a8eb1bedab6c": { + "_id": "67581b7d5890a8eb1bedab6c", + "refSeq": "ctgA", + "type": "mRNA", + "min": 1049, + "max": 9000, + "strand": 1, + "children": { + "67581b7d5890a8eb1bedab66": { + "_id": "67581b7d5890a8eb1bedab66", + "refSeq": "ctgA", + "type": "exon", + "min": 1200, + "max": 1500, + "strand": 1, + "attributes": { "gff_source": ["example"], "gff_id": ["exon1"] } + }, + "67581b7d5890a8eb1bedab67": { + "_id": "67581b7d5890a8eb1bedab67", + "refSeq": "ctgA", + "type": "CDS", + "min": 1210, + "max": 1710, + "strand": 1, + "attributes": { "gff_source": ["example"], "gff_id": ["cds2"] } + }, + "67581b7d5890a8eb1bedab68": { + "_id": "67581b7d5890a8eb1bedab68", + "refSeq": "ctgA", + "type": "CDS", + "min": 1200, + "max": 1700, + "strand": 1, + "attributes": { "gff_source": ["example"], "gff_id": ["cds1"] } + }, + "67581b7d5890a8eb1bedab69": { + "_id": "67581b7d5890a8eb1bedab69", + "refSeq": "ctgA", + "type": "exon", + "min": 1049, + "max": 1510, + "strand": 1 + }, + "67581b7d5890a8eb1bedab6b": { + "_id": "67581b7d5890a8eb1bedab6b", + "refSeq": "ctgA", + "type": "exon", + "min": 1600, + "max": 1800, + "strand": 1 + } + }, + "attributes": { "gff_source": ["example"], "gff_id": ["eden.1"] } + }, + "67581b7d5890a8eb1bedab6d": { + "_id": "67581b7d5890a8eb1bedab6d", + "refSeq": "ctgA", + "type": "TF_binding_site", + "min": 1049, + "max": 1100, + "strand": 1, + "attributes": { "gff_source": ["example"] } + } + }, + "attributes": { "gff_source": ["example"], "gff_id": ["eden"] } +} diff --git a/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff b/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff new file mode 100644 index 000000000..4e29b0854 --- /dev/null +++ b/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff @@ -0,0 +1,14 @@ +##gff-version 3 +chr1 . gene 1000 9000 . + . ID=gene00001 +#chr1 . mRNA 1300 9000 . + . ID=mRNA00001;Parent=gene00001 +#chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00001 +#chr1 . CDS 5000 5500 . + 1 ID=cds00001;Parent=mRNA00001 +#chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00001 +chr1 . mRNA 1300 9000 . + . ID=mRNA00003;Parent=gene00001 +chr1 . five_prime_UTR 1300 1500 . + . Parent=mRNA00003 +chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00003 +chr1 . CDS 3301 3902 . + 0 ID=cds00003;Parent=mRNA00003 +chr1 . CDS 5000 5500 . + 1 ID=cds00003;Parent=mRNA00003 +chr1 . CDS 7000 7600 . + 2 ID=cds00003;Parent=mRNA00003 +chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00003 +chr1 . three_prime_UTR 8501 8900 . + . Parent=mRNA00003 diff --git a/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.json b/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.json new file mode 100644 index 000000000..ceb7b7edb --- /dev/null +++ b/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.json @@ -0,0 +1,73 @@ +{ + "_id": "675ad4e6a5abb3a5087c0652", + "refSeq": "chr1", + "type": "gene", + "min": 999, + "max": 9000, + "strand": 1, + "children": { + "675ad4e6a5abb3a5087c0651": { + "_id": "675ad4e6a5abb3a5087c0651", + "refSeq": "chr1", + "type": "mRNA", + "min": 1299, + "max": 9000, + "strand": 1, + "children": { + "675ad4e6a5abb3a5087c0649": { + "_id": "675ad4e6a5abb3a5087c0649", + "refSeq": "chr1", + "type": "CDS", + "min": 3300, + "max": 7600, + "strand": 1, + "attributes": { + "gff_id": ["cds00003"] + } + }, + "675ad4e6a5abb3a5087c064a": { + "_id": "675ad4e6a5abb3a5087c064a", + "refSeq": "chr1", + "type": "exon", + "min": 1299, + "max": 1500, + "strand": 1 + }, + "675ad4e6a5abb3a5087c064e": { + "_id": "675ad4e6a5abb3a5087c064e", + "refSeq": "chr1", + "type": "exon", + "min": 2999, + "max": 3902, + "strand": 1 + }, + "675ad4e6a5abb3a5087c064f": { + "_id": "675ad4e6a5abb3a5087c064f", + "refSeq": "chr1", + "type": "exon", + "min": 4999, + "max": 5500, + "strand": 1 + }, + "675ad4e6a5abb3a5087c0650": { + "_id": "675ad4e6a5abb3a5087c0650", + "refSeq": "chr1", + "type": "exon", + "min": 6999, + "max": 8000, + "strand": 1 + }, + "675ad4e6a5abb3a5087c064d": { + "_id": "675ad4e6a5abb3a5087c064d", + "refSeq": "chr1", + "type": "exon", + "min": 8500, + "max": 8900, + "strand": 1 + } + }, + "attributes": { "gff_id": ["mRNA00003"] } + } + }, + "attributes": { "gff_id": ["gene00001"] } +} diff --git a/packages/apollo-shared/test_data/gene_representations.gff3 b/packages/apollo-shared/test_data/gene_representations.gff3 index b9aff4134..06f6c275b 100644 --- a/packages/apollo-shared/test_data/gene_representations.gff3 +++ b/packages/apollo-shared/test_data/gene_representations.gff3 @@ -1,4 +1,5 @@ ##gff-version 3 +##sequence-region chr1 1000 39000 # example 1 chr1 . gene 1000 9000 . + . ID=gene10001;Name=EDEN chr1 . TF_binding_site 1000 1012 . + . ID=tfbs10001;Parent=gene10001 @@ -76,10 +77,10 @@ chr1 . CDS 21201 21500 . + 0 ID=cds30005;Parent=mRNA30002;Name=edenprotein.2 chr1 . CDS 25000 25500 . + 0 ID=cds30006;Parent=mRNA30002;Name=edenprotein.2 chr1 . CDS 27000 27600 . + 0 ID=cds30007;Parent=mRNA30002;Name=edenprotein.2 chr1 . CDS 23301 23902 . + 0 ID=cds30008;Parent=mRNA30003;Name=edenprotein.3 -chr1 . CDS 25000 25500 . + 1 ID=cds30009;Parent=mRNA30003;Name=edenprotein.3 -chr1 . CDS 27000 27600 . + 1 ID=cds30010;Parent=mRNA30003;Name=edenprotein.3 -chr1 . CDS 23391 23902 . + 0 ID=cds30011;Parent=mRNA30003;Name=edenprotein.4 -chr1 . CDS 25000 25500 . + 1 ID=cds30012;Parent=mRNA30003;Name=edenprotein.4 +chr1 . CDS 25000 25500 . + 2 ID=cds30009;Parent=mRNA30003;Name=edenprotein.3 +chr1 . CDS 27000 27600 . + 2 ID=cds30010;Parent=mRNA30003;Name=edenprotein.3 +chr1 . CDS 23391 23902 . + 1 ID=cds30011;Parent=mRNA30003;Name=edenprotein.4 +chr1 . CDS 25000 25500 . + 2 ID=cds30012;Parent=mRNA30003;Name=edenprotein.4 chr1 . CDS 27000 27600 . + 1 ID=cds30013;Parent=mRNA30003;Name=edenprotein.4 # example 4 chr1 . gene 31000 39000 . + . ID=gene40001;Name=EDEN diff --git a/packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.gff b/packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.gff new file mode 100644 index 000000000..7d469cc05 --- /dev/null +++ b/packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.gff @@ -0,0 +1,6 @@ +##gff-version 3 +chr1 . gene 1000 9000 . + . ID=gene00001 +chr1 . mRNA 1300 9000 . + . ID=mRNA00001;Parent=gene00001 +chr1 . five_prime_UTR 3000 3300 . + . Parent=mRNA00001 +chr1 . CDS 5000 5500 . + 1 ID=cds00001;Parent=mRNA00001 +chr1 . three_prime_UTR 7601 8000 . + . Parent=mRNA00001 diff --git a/packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.json b/packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.json new file mode 100644 index 000000000..505692dde --- /dev/null +++ b/packages/apollo-shared/test_data/onecds_without_exon_spliced_utr.json @@ -0,0 +1,55 @@ +{ + "_id": "675af35f5758c90ab1d55838", + "refSeq": "chr1", + "type": "gene", + "min": 999, + "max": 9000, + "strand": 1, + "children": { + "675af35f5758c90ab1d55837": { + "_id": "675af35f5758c90ab1d55837", + "refSeq": "chr1", + "type": "mRNA", + "min": 1299, + "max": 9000, + "strand": 1, + "children": { + "675af35f5758c90ab1d55833": { + "_id": "675af35f5758c90ab1d55833", + "refSeq": "chr1", + "type": "CDS", + "min": 4999, + "max": 5500, + "strand": 1, + "attributes": { "gff_id": ["cds00001"] } + }, + "675af35f5758c90ab1d55834": { + "_id": "675af35f5758c90ab1d55834", + "refSeq": "chr1", + "type": "exon", + "min": 2999, + "max": 3300, + "strand": 1 + }, + "675af35f5758c90ab1d55836": { + "_id": "675af35f5758c90ab1d55836", + "refSeq": "chr1", + "type": "exon", + "min": 4999, + "max": 5500, + "strand": 1 + }, + "675af35f5758c90ab1d55835": { + "_id": "675af35f5758c90ab1d55835", + "refSeq": "chr1", + "type": "exon", + "min": 7600, + "max": 8000, + "strand": 1 + } + }, + "attributes": { "gff_id": ["mRNA00001"] } + } + }, + "attributes": { "gff_id": ["gene00001"] } +} diff --git a/packages/jbrowse-plugin-apollo/cypress/e2e/editFeature.cy.ts b/packages/jbrowse-plugin-apollo/cypress/e2e/editFeature.cy.ts index 1db38ee35..e8ea491f0 100644 --- a/packages/jbrowse-plugin-apollo/cypress/e2e/editFeature.cy.ts +++ b/packages/jbrowse-plugin-apollo/cypress/e2e/editFeature.cy.ts @@ -239,7 +239,7 @@ describe('Different ways of editing features', () => { cy.reload() // Ideally, you shouldn't need to reload to see the change? cy.get('tbody', { timeout: 60_000 }).within(() => { cy.get('input[value="start_codon"]').should('have.length', 1) - cy.get('input[value="1"]').should('have.length', 4) + cy.get('input[value="1"]').should('have.length', 5) cy.get('input[value="3"]').should('have.length', 1) }) })