Merge pull request #4 from katapod/feature_xml_to_json_parser

Feature: XML to JSON parser
katapod · Aug 24, 2023 · 646cb4b · 646cb4b
2 parents 9c443c7 + ab3d0dc
commit 646cb4b
Show file tree

Hide file tree

Showing 7 changed files with 3,767 additions and 5,000 deletions.
diff --git a/package.json b/package.json
@@ -41,6 +41,7 @@
   "homepage": "https://github.com/Aerilym/mini-xml",
   "devDependencies": {
     "@tsconfig/node16": "^1.0.4",
+    "@types/dom-parser": "^0.1.1",
     "@types/jest": "^29.5.1",
     "@types/node": "^20.2.5",
     "@typescript-eslint/eslint-plugin": "^5.59.8",
@@ -58,5 +59,8 @@
     "ts-node": "^10.9.1",
     "typedoc": "^0.24.7",
     "typescript": "^5.0.4"
+  },
+  "dependencies": {
+    "dom-parser": "^0.1.6"
   }
 }
diff --git a/src/index.test.ts b/src/index.test.ts
@@ -3,4 +3,8 @@ describe('index', () => {
     const { generateXMLFromObject } = require('./index');
     expect(generateXMLFromObject).toBeDefined();
   });
+  it('should export generateObjectFromXML', () => {
+    const { generateObjectFromXML } = require('./index');
+    expect(generateObjectFromXML).toBeDefined(); // Only checks that the export exists, not what it returns.
+  });
 });
diff --git a/src/index.ts b/src/index.ts
@@ -1 +1,2 @@
 export { generateXMLFromObject } from './builder';
+export { generateObjectFromXML } from './parser';
diff --git a/src/parser.test.ts b/src/parser.test.ts
diff --git a/src/parser.ts b/src/parser.ts
@@ -0,0 +1,133 @@
+import DomParser, { Node } from 'dom-parser';
+
+interface CleanNode extends Node {
+  jsonName: string; // Key this node is stored under in the JSON output; assigned by cleanNode().
+}
+
+/**
+ * Tags a parsed node with `jsonName`, the key it is stored under in the JSON output.
+ * The node is mutated in place and returned; no copy is made.
+ * @param node - The node to clean.
+ * @returns The same node, with `jsonName` set to `nodeName:namespace` when the node has a
+ *   namespace and to the plain `nodeName` otherwise.
+ */
+function cleanNode(node: Node): CleanNode {
+  const cleanNode = node as CleanNode;
+  // Read `namespace` through a typed cast instead of a JSON.stringify/JSON.parse round-trip
+  // of the whole node, which ran for every node just to fetch this one field.
+  // NOTE(review): assumes `namespace` is a plain own property of the parsed node - confirm
+  // against dom-parser.
+  const { namespace } = node as Node & { namespace?: string | null };
+  cleanNode.jsonName = namespace ? `${cleanNode.nodeName}:${namespace}` : cleanNode.nodeName;
+  return cleanNode;
+}
+
+/**
+ * Generates a JSON object from an XML string.
+ * Only the first `channel` element is converted: attributes become `@name` keys, text that
+ * sits next to attributes becomes `#text`, and repeated child tags are collected into arrays.
+ * @param xmlString - The XML string to parse.
+ * @returns A JSON object representing the XML.
+ * @throws Error if the XML has no `channel` element.
+ */
+export function generateObjectFromXML(xmlString: string): XMLObject {
+  const parser = new DomParser();
+  const xmlDoc = parser.parseFromString(xmlString);
+
+  // Extract the CDATA from the XML. This is a hacky way to do it, but it works. We can rebuild the CDATA but the XML parser doesn't like the broken <br> tags.
+  const rawCDATA = xmlString.match(/<!\[CDATA\[(.*?)\]\]>/g);
+
+  // Extract the link tags from the XML. This is a hacky way to do it, but it works. The parser doesn't like the <link></link> tags and seems to remove their text content and move them to the parent node.
+  const rawLinkTags = xmlString
+    .match(/<link>(.*?)<\/link>/g)
+    ?.map((tag) => tag.replace('<link>', '').replace('</link>', ''));
+
+  // Untouched copy of every link text; rawLinkTags itself is consumed with shift() below.
+  const textLinks = [...(rawLinkTags ?? [])];
+
+  const itunesNewFeedUrl = xmlString
+    .match(/<itunes:new-feed-url>(.*?)<\/itunes:new-feed-url>/g)
+    ?.map((tag) => tag.replace('<itunes:new-feed-url>', '').replace('</itunes:new-feed-url>', ''));
+
+  /**
+   * Converts a node to a node for the json object
+   * @param rawNode - The node to convert.
+   * @returns The converted node.
+   */
+  function xmlToJson(rawNode: Node): any {
+    const data: any = {};
+
+    const node = cleanNode(rawNode);
+
+    // TODO: Investigate parsing the link tags properly.
+    // A link node takes the next regex-extracted link text (undefined once none are left).
+    if (node.jsonName === 'link') return rawLinkTags?.shift();
+
+    const attributes = node.attributes as unknown as Array<{ name: string; value: string }>;
+    if (attributes && attributes.length > 0) {
+      for (let i = 0; i < attributes.length; i++) {
+        const attribute = attributes[i];
+        data[`@${attribute.name}`] = attribute.value;
+      }
+    }
+
+    if (
+      !node.childNodes ||
+      node.childNodes.length === 0 ||
+      (node.childNodes.length === 1 && node.childNodes[0].textContent)
+    ) {
+      // No children, or one child with text: blank text and text equal to a link are dropped.
+      let text: string | undefined = node.textContent.trim();
+      if (text === '') text = undefined;
+      else if (textLinks.includes(text)) text = undefined;
+
+      if (Object.keys(data).length === 0) return text;
+      if (text === undefined) return data;
+
+      data['#text'] = text;
+      return data;
+    }
+
+    if (node.childNodes[0].textContent.trim() === '<![CDATA[') {
+      //TODO: Investigate fixing this. It breaks because <br> tags are not closed and the parser doesn't like it.
+      /* node.childNodes.splice(0, 1);
+      node.childNodes.splice(node.childNodes.length - 1, 1);
+      const cdataJson = xmlToJson(node);
+      const cdataXML = generateXMLFromObject(cdataJson, { excludeXMLHeader: true, pretty: false }); */
+      data['#cdata'] = rawCDATA?.shift()?.replace('<![CDATA[', '').replace(']]>', '');
+      return data;
+    }
+
+    for (let i = 0; i < node.childNodes.length; i++) {
+      const child = cleanNode(node.childNodes[i]);
+      const jsonNode = xmlToJson(child);
+      if (jsonNode === undefined) continue;
+      //TODO: Remove this once we fix the hyphen tag name issue.
+      if (
+        node.jsonName === 'channel' &&
+        typeof jsonNode === 'string' &&
+        jsonNode.includes('new-feed-url')
+      ) {
+        if (itunesNewFeedUrl !== undefined && itunesNewFeedUrl.length > 0) {
+          data['itunes:new-feed-url'] = itunesNewFeedUrl[0];
+        }
+        continue;
+      }
+      // The first occurrence of a tag is stored as-is; a repeat turns the entry into an array.
+      if (data[child.jsonName] === undefined) {
+        data[child.jsonName] = jsonNode;
+      } else {
+        if (!Array.isArray(data[child.jsonName])) {
+          data[child.jsonName] = [data[child.jsonName]];
+        }
+        data[child.jsonName].push(jsonNode);
+      }
+    }
+
+    return data;
+  }
+
+  const node = xmlDoc.getElementsByTagName('channel');
+  // An empty result is still truthy, so check the length as well before indexing into it.
+  if (!node || node.length === 0) throw new Error('No channel node found');
+  return xmlToJson(node[0]);
+}