From 8e4e107e91b66ba202f41ef296e1201ee47c9129 Mon Sep 17 00:00:00 2001 From: Sasikanth Miriyampalli Date: Fri, 9 Aug 2024 10:41:33 +0530 Subject: [PATCH] Refactor `HtmlContentParser` Uses `Ksoup#clean` API to clean up the content and extract text and lead image --- .../core/network/parser/AtomContentParser.kt | 2 +- .../core/network/parser/HtmlContentParser.kt | 58 ++++++++--------- .../core/network/parser/RDFContentParser.kt | 2 +- .../core/network/parser/RSSContentParser.kt | 2 +- .../network/parser/HtmlContentParserTest.kt | 63 +++++++++++++++++++ 5 files changed, 91 insertions(+), 36 deletions(-) create mode 100644 core/network/src/commonTest/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParserTest.kt diff --git a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/AtomContentParser.kt b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/AtomContentParser.kt index 37eebfb2b..615962a23 100644 --- a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/AtomContentParser.kt +++ b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/AtomContentParser.kt @@ -124,7 +124,7 @@ internal object AtomContentParser : ContentParser() { val htmlContent = HtmlContentParser.parse(htmlContent = rawContent) if (image.isNullOrBlank() && htmlContent != null) { - image = htmlContent.imageUrl + image = htmlContent.leadImage } content = htmlContent?.content?.ifBlank { rawContent.trim() } ?: rawContent.trim() diff --git a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParser.kt b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParser.kt index 3d12066a3..f7a13398a 100644 --- a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParser.kt +++ 
b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParser.kt
@@ -17,36 +17,47 @@ package dev.sasikanth.rss.reader.core.network.parser
 
 import co.touchlab.crashkios.bugsnag.BugsnagKotlin
 import com.fleeksoft.ksoup.Ksoup
+import com.fleeksoft.ksoup.safety.Safelist
 import io.ktor.utils.io.charsets.MalformedInputException
 
 internal object HtmlContentParser {
 
-  private val allowedContentTags = setOf("p", "span", "em", "u", "b", "i", "strong")
+  private const val TAG_BODY = "body"
+  private const val TAG_IMG = "img"
+  private const val TAG_FIGCAPTION = "figcaption"
+  private const val ATTR_SRC = "src"
 
-  fun parse(htmlContent: String): HtmlContent? {
+  private val allowedContentTags =
+    Safelist().addTags(TAG_FIGCAPTION, TAG_IMG).addAttributes(TAG_IMG, ATTR_SRC)
+  // Matches a `.gif` extension at the end of a URL (optionally followed by a query), any case.
+  private val gifRegex by lazy { Regex("\\.gif(\\?.*)?\$", RegexOption.IGNORE_CASE) }
+
+  /**
+   * Cleans [htmlContent] with a safelist that permits only `img` (with `src`) and `figcaption`,
+   * then returns the first `img` `src` among the body's direct children that does not look like
+   * a GIF, plus the body's own text. Returns `null` for blank input or when no body is found.
+   */
+  fun parse(htmlContent: String): Result? {
     if (htmlContent.isBlank()) return null
 
     return try {
-      val document = Ksoup.parse(htmlContent)
-
-      val imageUrl =
-        document
-          .getElementsByTag("img")
-          .firstOrNull { it.hasAttr("src") && !it.attr("src").endsWith(".gif") }
-          ?.attr("src")
-
-      val contentStringBuilder = StringBuilder()
-      document.getAllElements().forEach { element ->
-        if (allowedContentTags.contains(element.tagName())) {
-          contentStringBuilder.append(element.text().cleanWhitespaces())
-        }
+      val cleanedHtml = Ksoup.clean(htmlContent, allowedContentTags)
+      val document = Ksoup.parse(cleanedHtml)
+      val body = document.getElementsByTag(TAG_BODY).first() ?: return null
+      val elements = body.children()
 
-        if (element.tagName() == "p" || element.tagName() == "br") {
-          contentStringBuilder.appendLine()
+      val leadImage =
+        elements.firstNotNullOfOrNull {
+          val imageUrl = it.attr(ATTR_SRC)
+          if (it.tagName() == TAG_IMG && !gifRegex.containsMatchIn(imageUrl)) {
+            imageUrl.removeSurrounding("\"")
+          } else {
+            null
+          }
         }
-      }
+      val content = body.ownText()
 
-      HtmlContent(imageUrl = imageUrl, content = contentStringBuilder.toString())
+      Result(leadImage = leadImage, content = content)
     } catch (e: Exception) {
       null
     } catch (e: MalformedInputException) {
@@ -55,18 +66,5 @@ internal object HtmlContentParser {
     }
   }
 
-  private fun String.cleanWhitespaces(): String {
-    var formattedText = this.trim()
-    if (formattedText.isNotBlank()) {
-      if (this[0].isWhitespace()) {
-        formattedText = " $formattedText"
-      }
-      if (this.last().isWhitespace()) {
-        formattedText += " "
-      }
-    }
-    return formattedText
-  }
-
-  data class HtmlContent(val imageUrl: String?, val content: String)
+  data class Result(val leadImage: String?, val content: String)
 }
diff --git a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RDFContentParser.kt b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RDFContentParser.kt
index 055ac3d49..caeb2d5a4 100644
--- a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RDFContentParser.kt
+++ b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RDFContentParser.kt
@@ -128,7 +128,7 @@ internal object RDFContentParser : ContentParser() {
           val htmlContent = HtmlContentParser.parse(htmlContent = rawContent)
 
           if (image.isNullOrBlank() && htmlContent != null) {
-            image = htmlContent.imageUrl
+            image = htmlContent.leadImage
           }
 
           description = htmlContent?.content?.ifBlank { rawContent.trim() } ?: rawContent.trim()
diff --git a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RSSContentParser.kt b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RSSContentParser.kt
index c13230cd0..f550c14a9 100644
--- a/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RSSContentParser.kt
+++ b/core/network/src/commonMain/kotlin/dev/sasikanth/rss/reader/core/network/parser/RSSContentParser.kt
@@ -128,7 +128,7 @@ internal object RSSContentParser : ContentParser() {
           val htmlContent =
HtmlContentParser.parse(htmlContent = rawContent) if (image.isNullOrBlank() && htmlContent != null) { - image = htmlContent.imageUrl + image = htmlContent.leadImage } description = htmlContent?.content?.ifBlank { rawContent.trim() } ?: rawContent.trim() diff --git a/core/network/src/commonTest/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParserTest.kt b/core/network/src/commonTest/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParserTest.kt new file mode 100644 index 000000000..97c253745 --- /dev/null +++ b/core/network/src/commonTest/kotlin/dev/sasikanth/rss/reader/core/network/parser/HtmlContentParserTest.kt @@ -0,0 +1,63 @@ +/* + * Copyright 2024 Sasikanth Miriyampalli + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package dev.sasikanth.rss.reader.core.network.parser + +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNull + +class HtmlContentParserTest { + + companion object { + private const val TEST_HTML = + """ +
+ A screenshot from DOOM + DOOM II. +
Image: Bethesda
+
+

If you haven’t played Doom or Doom II for a while — or ever — a new re-release that Bethesda surprise-dropped (sorta) on Thursday might be the perfect excuse to jump in to the classic games. The re-release, which combines both games into one package called Doom + Doom II and is a free update for anyone who already owns Doom (1993) or Doom II, offers a long list of great new features — including a brand new single-player episode and online, cross-platform deathmatch multiplayer.

+

With Doom + Doom II, you’ll have access to both of those two games as well as extra single-player content like John Romero’s Sigil episode released in 2019 and Legacy of Rust, which is a new Doom episode created by “individuals from id Software, Nightdive Studios...

+

Continue reading…

+ """ + } + + @Test + fun parsingLeadImageAndContentFromHtmlShouldWorkCorrectly() { + // when + val result = HtmlContentParser.parse(TEST_HTML) + + // then + assertEquals( + "https://cdn.vox-cdn.com/thumbor/LJt9a0BM9fnTyZtP68Ba1Mr1YDY=/150x0:1770x1080/1310x873/cdn.vox-cdn.com/uploads/chorus_image/image/73510530/ss_c5781b8f9a8181e6c989869b86d0b455ccca344a.0.jpg", + result?.leadImage, + ) + assertEquals( + "If you haven’t played Doom or Doom II for a while — or ever — a new re-release that Bethesda surprise-dropped (sorta) on Thursday might be the perfect excuse to jump in to the classic games. The re-release, which combines both games into one package called Doom + Doom II and is a free update for anyone who already owns Doom (1993) or Doom II, offers a long list of great new features — including a brand new single-player episode and online, cross-platform deathmatch multiplayer. With Doom + Doom II, you’ll have access to both of those two games as well as extra single-player content like John Romero’s Sigil episode released in 2019 and Legacy of Rust, which is a new Doom episode created by “individuals from id Software, Nightdive Studios... Continue reading…", + result?.content, + ) + } + + @Test + fun parsingContentFromTextShouldWorkCorrectly() { + // when + val result = HtmlContentParser.parse("This is a normal text") + + // then + assertNull(result?.leadImage) + assertEquals("This is a normal text", result?.content) + } +}