GMOD · garrettjstevens · Jan 23, 2025 · Nov 26, 2024 · Nov 27, 2024 · Nov 27, 2024
diff --git a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.test.ts
@@ -197,6 +197,37 @@ describe('gff3ToAnnotationFeature examples', () => {
   })
 })
 
+describe('CDS without exons', () => {
+  it('Convert mRNA with CDS but without exon', () => {
+    const [gffFeature] = readFeatureFile('test_data/cds_without_exon.gff')
+    const actual = gff3ToAnnotationFeature(gffFeature)
+    const expected = readAnnotationFeatureSnapshot(
+      'test_data/cds_without_exon.json',
+    )
+    compareFeatures(actual, expected)
+  })
+  it('Convert mRNA with CDS but without exon and spliced UTR', () => {
+    const [gffFeature] = readFeatureFile(
+      'test_data/cds_without_exon_spliced_utr.gff',
+    )
+    const actual = gff3ToAnnotationFeature(gffFeature)
+    const expected = readAnnotationFeatureSnapshot(
+      'test_data/cds_without_exon_spliced_utr.json',
+    )
+    compareFeatures(actual, expected)
+  })
+  it('Convert mRNA with one CDS, without exons non-adjacent UTR', () => {
+    const [gffFeature] = readFeatureFile(
+      'test_data/onecds_without_exon_spliced_utr.gff',
+    )
+    const actual = gff3ToAnnotationFeature(gffFeature)
+    const expected = readAnnotationFeatureSnapshot(
+      'test_data/onecds_without_exon_spliced_utr.json',
+    )
+    compareFeatures(actual, expected)
+  })
+})
+
 describe('gff3ToAnnotationFeature', () => {
   for (const testCase of testCases) {
     const [description, featureLine, convertedFeature] = testCase

diff --git a/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts b/packages/apollo-shared/src/GFF3/gff3ToAnnotationFeature.ts
@@ -157,8 +157,19 @@ function convertChildren(
   const { child_features: childFeatures } = firstFeature
 
   const cdsFeatures: GFF3Feature[] = []
+  const exonFeatures: GFF3Feature[] = []
+  const utrFeatures: GFF3Feature[] = []
   for (const childFeature of childFeatures) {
     const [firstChildFeatureLocation] = childFeature
+    if (firstChildFeatureLocation.type === 'exon') {
+      exonFeatures.push(childFeature)
+    }
+    if (
+      firstChildFeatureLocation.type === 'three_prime_UTR' ||
+      firstChildFeatureLocation.type === 'five_prime_UTR'
+    ) {
+      utrFeatures.push(childFeature)
+    }
     if (
       firstChildFeatureLocation.type === 'three_prime_UTR' ||
       firstChildFeatureLocation.type === 'five_prime_UTR' ||
@@ -175,10 +186,23 @@ function convertChildren(
       convertedChildren[child._id] = child
     }
   }
-  const processedCDS =
-    cdsFeatures.length > 0 ? processCDS(cdsFeatures, refSeq, featureIds) : []
-  for (const cds of processedCDS) {
-    convertedChildren[cds._id] = cds
+
+  if (cdsFeatures.length > 0) {
+    const processedCDS = processCDS(cdsFeatures, refSeq, featureIds)
+
+    for (const cds of processedCDS) {
+      convertedChildren[cds._id] = cds
+    }
+
+    const missingExons = inferMissingExons(
+      cdsFeatures,
+      exonFeatures,
+      utrFeatures,
+      processedCDS[0].refSeq,
+    )
+    for (const exon of missingExons) {
+      convertedChildren[exon._id] = exon
+    }
   }
 
   if (Object.keys(convertedChildren).length > 0) {
@@ -187,6 +211,149 @@ function convertChildren(
   return
 }
 
+function inferMissingExons(
+  cdsFeatures: GFF3Feature[],
+  existingExons: GFF3Feature[],
+  utrFeatures: GFF3Feature[],
+  refSeq: string,
+): AnnotationFeatureSnapshot[] {
+  // Convert utrFeatures from GFF3Feature to AnnotationFeatureSnapshot
+  const utrExons: AnnotationFeatureSnapshot[] = []
+  for (const utrs of utrFeatures) {
+    for (const utr of utrs) {
+      if (!utr.start || !utr.end) {
+        throw new Error(
+          `UTR has undefined start and/or end\n: ${JSON.stringify(utr, null, 2)}`,
+        )
+      }
+      let strand: 1 | -1 | undefined = undefined
+      if (utr.strand === '+') {
+        strand = 1
+      } else if (utr.strand === '-') {
+        strand = -1
+      }
+      utrExons.push({
+        _id: new ObjectID().toHexString(),
+        refSeq,
+        type: 'exon',
+        min: utr.start - 1,
+        max: utr.end,
+        strand,
+      })
+    }
+  }
+  utrExons.sort((a, b) => a.min - b.min)
+
+  const missingExons: AnnotationFeatureSnapshot[] = []
+  for (const protein of cdsFeatures) {
+    protein.sort((a, b) => {
+      if (!a.start || !b.start) {
+        throw new Error('CDS has undefined start')
+      }
+      return a.start - b.start
+    })
+    for (let cdsIdx = 0; cdsIdx < protein.length; cdsIdx++) {
+      const cds = protein[cdsIdx]
+      // For CDS check if there is an exon containing it. If not, create an exon with same coords as the CDS.
+      let exonFound = false
+      for (const x of existingExons) {
+        if (x.length != 1) {
+          throw new Error('Unexpected number of exons')
+        }
+        const [exon] = x
+        if (
+          exon.start &&
+          exon.end &&
+          cds.start &&
+          cds.end &&
+          exon.start <= cds.start &&
+          exon.end >= cds.end
+        ) {
+          exonFound = true
+          break
+        }
+      }
+      if (!exonFound) {
+        if (!cds.start || !cds.end) {
+          throw new Error(
+            `CDS has undefined start and/or end: ${JSON.stringify(cds, null, 2)}`,
+          )
+        }
+        let strand: 1 | -1 | undefined = undefined
+        if (cds.strand === '+') {
+          strand = 1
+        } else if (cds.strand === '-') {
+          strand = -1
+        }
+        const newExon: AnnotationFeatureSnapshot = {
+          _id: new ObjectID().toHexString(),
+          refSeq,
+          type: 'exon',
+          min: cds.start - 1,
+          max: cds.end,
+          strand,
+        }
+        if (cdsIdx === 0) {
+          // If this CDS is the leftmost (or the only CDS in this protein), check if we need to add UTRs before it
+          for (const utr of utrExons) {
+            if (utr.max > newExon.min) {
+              break
+            }
+            if (utr.max === newExon.min) {
+              // UTR ends where exon begins: Extend the exon to include this UTR
+              newExon.min = utr.min
+            } else {
+              missingExons.push(utr)
+            }
+          }
+        }
+        if (cdsIdx === protein.length - 1) {
+          // If this CDS is the rightmost (or the only CDS in this protein), check if we need to add UTRs after it
+          for (const utr of utrExons) {
+            if (utr.min < newExon.max) {
+              continue
+            }
+            if (utr.min === newExon.max) {
+              // UTR begins where exon end: Extend the exon to include this UTR
+              newExon.max = utr.max
+            } else {
+              missingExons.push(utr)
+            }
+          }
+        }
+        missingExons.push(newExon)
+      }
+    }
+  }
+  const mergedExons = mergeAnnotationFeatures(missingExons)
+  return mergedExons
+}
+
+function mergeAnnotationFeatures(
+  features: AnnotationFeatureSnapshot[],
+): AnnotationFeatureSnapshot[] {
+  if (features.length === 0) {
+    return []
+  }
+  features.sort((a, b) => a.min - b.min)
+
+  const res = []
+  res.push(features[0])
+
+  for (let i = 1; i < features.length; i++) {
+    const last = res.at(-1)
+    const curr = features[i]
+
+    // If current interval overlaps with the last merged interval, merge them
+    if (last && curr.min <= last.max) {
+      last.max = Math.max(last.max, curr.max)
+    } else {
+      res.push(curr)
+    }
+  }
+  return res
+}
+
 /**
  * If a GFF3 file has CDS features that either (1) don't have an ID or (2) have
  * different IDs for each CDS, we have to do a bit of guessing about how they

diff --git a/packages/apollo-shared/test_data/cds_without_exon.gff b/packages/apollo-shared/test_data/cds_without_exon.gff
@@ -0,0 +1,12 @@
+##gff-version 3
+##sequence-region	53cb6b9b4f4ddef1ad47f943	1050	9000
+ctgA	example	gene	1050	9000	.	+	.	ID=eden
+ctgA	example	mRNA	1050	9000	.	+	.	ID=eden.1;Parent=eden
+ctgA	example	five_prime_UTR	1050	1210	.	+	0	ID=five1;Parent=eden.1
+ctgA	example	CDS	1211	1510	.	+	0	ID=cds2;Parent=eden.1
+ctgA	example	CDS	1611	1710	.	+	0	ID=cds2;Parent=eden.1
+ctgA	example	three_prime_UTR	1711	1800	.	+	0	ID=three1;Parent=eden.1
+ctgA	example	exon	1201	1500	.	+	0	ID=exon1;Parent=eden.1
+ctgA	example	CDS	1601	1700	.	+	0	ID=cds1;Parent=eden.1
+ctgA	example	CDS	1201	1500	.	+	0	ID=cds1;Parent=eden.1
+ctgA	example	TF_binding_site	1050	1100	.	+	.	Parent=eden
diff --git a/packages/apollo-shared/test_data/cds_without_exon.json b/packages/apollo-shared/test_data/cds_without_exon.json
@@ -0,0 +1,74 @@
+{
+  "_id": "67581b7d5890a8eb1bedab6e",
+  "refSeq": "ctgA",
+  "type": "gene",
+  "min": 1049,
+  "max": 9000,
+  "strand": 1,
+  "children": {
+    "67581b7d5890a8eb1bedab6c": {
+      "_id": "67581b7d5890a8eb1bedab6c",
+      "refSeq": "ctgA",
+      "type": "mRNA",
+      "min": 1049,
+      "max": 9000,
+      "strand": 1,
+      "children": {
+        "67581b7d5890a8eb1bedab66": {
+          "_id": "67581b7d5890a8eb1bedab66",
+          "refSeq": "ctgA",
+          "type": "exon",
+          "min": 1200,
+          "max": 1500,
+          "strand": 1,
+          "attributes": { "gff_source": ["example"], "gff_id": ["exon1"] }
+        },
+        "67581b7d5890a8eb1bedab67": {
+          "_id": "67581b7d5890a8eb1bedab67",
+          "refSeq": "ctgA",
+          "type": "CDS",
+          "min": 1210,
+          "max": 1710,
+          "strand": 1,
+          "attributes": { "gff_source": ["example"], "gff_id": ["cds2"] }
+        },
+        "67581b7d5890a8eb1bedab68": {
+          "_id": "67581b7d5890a8eb1bedab68",
+          "refSeq": "ctgA",
+          "type": "CDS",
+          "min": 1200,
+          "max": 1700,
+          "strand": 1,
+          "attributes": { "gff_source": ["example"], "gff_id": ["cds1"] }
+        },
+        "67581b7d5890a8eb1bedab69": {
+          "_id": "67581b7d5890a8eb1bedab69",
+          "refSeq": "ctgA",
+          "type": "exon",
+          "min": 1049,
+          "max": 1510,
+          "strand": 1
+        },
+        "67581b7d5890a8eb1bedab6b": {
+          "_id": "67581b7d5890a8eb1bedab6b",
+          "refSeq": "ctgA",
+          "type": "exon",
+          "min": 1600,
+          "max": 1800,
+          "strand": 1
+        }
+      },
+      "attributes": { "gff_source": ["example"], "gff_id": ["eden.1"] }
+    },
+    "67581b7d5890a8eb1bedab6d": {
+      "_id": "67581b7d5890a8eb1bedab6d",
+      "refSeq": "ctgA",
+      "type": "TF_binding_site",
+      "min": 1049,
+      "max": 1100,
+      "strand": 1,
+      "attributes": { "gff_source": ["example"] }
+    }
+  },
+  "attributes": { "gff_source": ["example"], "gff_id": ["eden"] }
+}
diff --git a/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff b/packages/apollo-shared/test_data/cds_without_exon_spliced_utr.gff
@@ -0,0 +1,14 @@
+##gff-version 3
+chr1	.	gene	1000	9000	.	+	.	ID=gene00001
+#chr1	.	mRNA	1300	9000	.	+	.	ID=mRNA00001;Parent=gene00001
+#chr1	.	five_prime_UTR	3000	3300	.	+	.	Parent=mRNA00001
+#chr1	.	CDS	5000	5500	.	+	1	ID=cds00001;Parent=mRNA00001
+#chr1	.	three_prime_UTR	7601	8000	.	+	.	Parent=mRNA00001
+chr1	.	mRNA	1300	9000	.	+	.	ID=mRNA00003;Parent=gene00001
+chr1	.	five_prime_UTR	1300	1500	.	+	.	Parent=mRNA00003
+chr1	.	five_prime_UTR	3000	3300	.	+	.	Parent=mRNA00003
+chr1	.	CDS	3301	3902	.	+	0	ID=cds00003;Parent=mRNA00003
+chr1	.	CDS	5000	5500	.	+	1	ID=cds00003;Parent=mRNA00003
+chr1	.	CDS	7000	7600	.	+	2	ID=cds00003;Parent=mRNA00003
+chr1	.	three_prime_UTR	7601	8000	.	+	.	Parent=mRNA00003
+chr1	.	three_prime_UTR	8501	8900	.	+	.	Parent=mRNA00003