fix: fix handling of image and PDF links with special characters (#84)

Handling of other media files like audio is still broken and not worth the fix right now.

From the attached GitHub issue:

> Aeons ago I made [a blog post](https://habi.gna.ch/2003/07/26/canyoning/) including the image `https://habi.gna.ch/blog/images/Picture(2).jpg`
>
> When calling `wp2hugo` with
> ```bash
> ./src/wp2hugo/bin/wp2hugo -source habignach.WordPress.2024-10-29.xml -download-media
> ```
> the process fails with
> ```bash
> 05:33:50PM DBG hugo_gen_setup.go:412 > Embedded media links links=1 page=https://habi.gna.ch/2003/07/26/canyoning/
> 05:33:50PM DBG hugo_gen_setup.go:416 > Downloading media files links=1
> 05:33:50PM INF media_cache_setup.go:33 > media https://habi.gna.ch/blog/images/Picture(2 will be fetched
> 05:33:50PM FTL main.go:43 > Error: error fetching media file https://habi.gna.ch/blog/images/Picture(2: error fetching media https://habi.gna.ch/blog/images/Picture(2: 404 Not Found
> ```
>
> I think the fetching link needs escaping of the parenthesis :)
>
> `grep "Picture(2" habignach.WordPress.2024-10-29.xml` returns `<a href="http://habi.gna.ch/blog/images/Picture(2).jpg"><img src="http://habi.gna.ch/blog/images/Picture(2)-tm.jpg" height="288" width="352" align="middle" border="2" hspace="0" vspace="0" alt="" longdesc="" /></a><p>`

BTW Ref: #81
ashishb · Nov 18, 2024 · b1da435 · b1da435
1 parent 9adcb21
commit b1da435
Show file tree

Hide file tree

Showing 3 changed files with 49 additions and 4 deletions.
diff --git a/src/wp2hugo/go.mod b/src/wp2hugo/go.mod
@@ -9,6 +9,7 @@ require (
 	github.com/PuerkitoBio/goquery v1.10.0
 	github.com/adrg/frontmatter v0.2.0
 	github.com/go-enry/go-enry/v2 v2.9.1
+	github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81
 	github.com/mergestat/timediff v0.0.3
 	github.com/mmcdole/gofeed v1.3.0
 	github.com/rs/zerolog v1.33.0

diff --git a/src/wp2hugo/go.sum b/src/wp2hugo/go.sum
@@ -25,6 +25,8 @@ github.com/go-enry/go-enry/v2 v2.9.1/go.mod h1:9yrj4ES1YrbNb1Wb7/PWYr2bpaCXUGRt0
 github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
 github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
 github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81 h1:5lyLWsV+qCkoYqsKUDuycESh9DEIPVKN6iCFeL7ag50=
+github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
 github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
 github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=

diff --git a/src/wp2hugo/internal/hugogenerator/hugopage/hugo_page.go b/src/wp2hugo/internal/hugogenerator/hugopage/hugo_page.go
@@ -2,6 +2,9 @@ package hugopage
 
 import (
 	"fmt"
+	"github.com/gomarkdown/markdown"
+	"github.com/gomarkdown/markdown/ast"
+	"github.com/gomarkdown/markdown/parser"
 	"io"
 	"net/url"
 	"regexp"
@@ -39,8 +42,6 @@ const _customMoreTag = "{{< more >}}"
 const _wordPressTocTag = "[toc]"
 
 var (
-	_markdownPdfLinks   = regexp.MustCompile(`\[.*?]\((.+?\.pdf).*?\)`)
-	_markdownImageLinks = regexp.MustCompile(`!\[.*?]\((.+?)\)`)
 	// E.g. <pre class="EnlighterJSRAW" data-enlighter-language="golang">
 	_preTagExtractor1 = regexp.MustCompile(`<pre class="EnlighterJSRAW" data-enlighter-language="([^"]+?)".*?>([\s\S]*?)</pre>`)
 	// E.g. <pre class="lang:bash" nums="false">
@@ -104,11 +105,11 @@ func (page Page) Write(w io.Writer) error {
 }
 
 func (page *Page) WPMediaLinks() []string {
-	arr1 := getMarkdownLinks(_markdownImageLinks, page.markdown)
+	arr1 := getImageLinks([]byte(page.markdown))
 	arr2 := getMarkdownLinks(_hugoFigureLinks, page.markdown)
 	arr3 := getMarkdownLinks(_hugoParallaxBlurLinks, page.markdown)
 	arr4 := getMarkdownLinks(_hugoAudioLinks, page.markdown)
-	arr5 := getMarkdownLinks(_markdownPdfLinks, page.markdown)
+	arr5 := getPDFLinks([]byte(page.markdown))
 	coverImageURL := page.getCoverImageURL()
 	result := append(append(append(append(arr1, arr2...), arr3...), arr4...), arr5...)
 	if coverImageURL != nil {
@@ -117,6 +118,47 @@ func (page *Page) WPMediaLinks() []string {
 	return result
 }
 
+// getImageLinks parses content as Markdown and returns the destination of
+// every image node, in the order the AST walk reaches them.
+//
+// Destinations are read from the parsed AST instead of being matched with a
+// regular expression, so the result is whatever the parser reports as each
+// image's target.
+func getImageLinks(content []byte) []string {
+	mdParser := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs)
+	root := markdown.Parse(content, mdParser)
+
+	var found []string
+	collect := func(node ast.Node, entering bool) ast.WalkStatus {
+		// Only act when the walker reports that it is entering the node.
+		if !entering {
+			return ast.GoToNext
+		}
+		if image, isImage := node.(*ast.Image); isImage {
+			found = append(found, string(image.Destination))
+		}
+		return ast.GoToNext
+	}
+	ast.WalkFunc(root, collect)
+
+	log.Debug().Int("count", len(found)).Msg("Image links")
+	return found
+}
+
+// getPDFLinks parses content as Markdown and returns the destination of every
+// link node that points at a PDF, in the order the AST walk reaches them.
+//
+// A destination counts as a PDF when the part before any query string ("?")
+// or fragment ("#") ends in ".pdf", compared case-insensitively. This keeps
+// links such as "report.pdf?ver=2" or "report.pdf#page=3" from being missed;
+// the regular expression this function replaced also accepted trailing text
+// after ".pdf".
+//
+// The destination is returned unmodified, including any query string or
+// fragment.
+// NOTE(review): the replaced regular expression returned the URL truncated
+// right after ".pdf" — confirm callers want the full destination here.
+func getPDFLinks(content []byte) []string {
+	extensions := parser.CommonExtensions | parser.AutoHeadingIDs
+	p := parser.NewWithExtensions(extensions)
+	doc := markdown.Parse(content, p)
+
+	var links []string
+	ast.WalkFunc(doc, func(node ast.Node, entering bool) ast.WalkStatus {
+		// Only act when the walker reports that it is entering the node.
+		if !entering {
+			return ast.GoToNext
+		}
+		link, ok := node.(*ast.Link)
+		if !ok {
+			return ast.GoToNext
+		}
+
+		destination := string(link.Destination)
+		// Ignore the query string and fragment when testing the extension.
+		target := destination
+		if i := strings.IndexAny(target, "?#"); i >= 0 {
+			target = target[:i]
+		}
+		if strings.HasSuffix(strings.ToLower(target), ".pdf") {
+			links = append(links, destination)
+		}
+		return ast.GoToNext
+	})
+
+	log.Debug().
+		Int("count", len(links)).
+		Msg("PDF links")
+	return links
+}
+
 func getMarkdownLinks(regex *regexp.Regexp, markdown string) []string {
 	var links []string
 	matches := regex.FindAllStringSubmatch(markdown, -1)