Skip to content

Commit

Permalink
fix: fix handling of image and PDF links with special characters (#84)
Browse files Browse the repository at this point in the history
Handling of other media files, such as audio, is still broken and is not
worth fixing right now.

From the attached GitHub issue

> Aeons ago I made [a blog post](https://habi.gna.ch/2003/07/26/canyoning/) including the image `https://habi.gna.ch/blog/images/Picture(2).jpg`
>
> When calling `wp2hugo` with
> ```bash
> ./src/wp2hugo/bin/wp2hugo -source habignach.WordPress.2024-10-29.xml  -download-media
> ```
> the process fails with
> ```bash
> 05:33:50PM DBG hugo_gen_setup.go:412 > Embedded media links links=1 page=https://habi.gna.ch/2003/07/26/canyoning/
> 05:33:50PM DBG hugo_gen_setup.go:416 > Downloading media files links=1
> 05:33:50PM INF media_cache_setup.go:33 > media https://habi.gna.ch/blog/images/Picture(2 will be fetched
> 05:33:50PM FTL main.go:43 > Error: error fetching media file https://habi.gna.ch/blog/images/Picture(2: error fetching media https://habi.gna.ch/blog/images/Picture(2: 404 Not Found
> ```
>
> I think the fetching link needs escaping of the parenthesis :)
>
> `grep "Picture(2" habignach.WordPress.2024-10-29.xml` returns `<a href="http://habi.gna.ch/blog/images/Picture(2).jpg"><img src="http://habi.gna.ch/blog/images/Picture(2)-tm.jpg" height="288" width="352" align="middle" border="2" hspace="0" vspace="0" alt="" longdesc="" /></a><p>` BTW

Ref: #81
  • Loading branch information
ashishb authored Nov 18, 2024
1 parent 9adcb21 commit b1da435
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 4 deletions.
1 change: 1 addition & 0 deletions src/wp2hugo/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ require (
github.com/PuerkitoBio/goquery v1.10.0
github.com/adrg/frontmatter v0.2.0
github.com/go-enry/go-enry/v2 v2.9.1
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81
github.com/mergestat/timediff v0.0.3
github.com/mmcdole/gofeed v1.3.0
github.com/rs/zerolog v1.33.0
Expand Down
2 changes: 2 additions & 0 deletions src/wp2hugo/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ github.com/go-enry/go-enry/v2 v2.9.1/go.mod h1:9yrj4ES1YrbNb1Wb7/PWYr2bpaCXUGRt0
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81 h1:5lyLWsV+qCkoYqsKUDuycESh9DEIPVKN6iCFeL7ag50=
github.com/gomarkdown/markdown v0.0.0-20241105142532-d03b89096d81/go.mod h1:JDGcbDT52eL4fju3sZ4TeHGsQwhG9nbDV21aMyhwPoA=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
Expand Down
50 changes: 46 additions & 4 deletions src/wp2hugo/internal/hugogenerator/hugopage/hugo_page.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ package hugopage

import (
"fmt"
"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"io"
"net/url"
"regexp"
Expand Down Expand Up @@ -39,8 +42,6 @@ const _customMoreTag = "{{< more >}}"
const _wordPressTocTag = "[toc]"

var (
_markdownPdfLinks = regexp.MustCompile(`\[.*?]\((.+?\.pdf).*?\)`)
_markdownImageLinks = regexp.MustCompile(`!\[.*?]\((.+?)\)`)
// E.g. <pre class="EnlighterJSRAW" data-enlighter-language="golang">
_preTagExtractor1 = regexp.MustCompile(`<pre class="EnlighterJSRAW" data-enlighter-language="([^"]+?)".*?>([\s\S]*?)</pre>`)
// E.g. <pre class="lang:bash" nums="false">
Expand Down Expand Up @@ -104,11 +105,11 @@ func (page Page) Write(w io.Writer) error {
}

func (page *Page) WPMediaLinks() []string {
arr1 := getMarkdownLinks(_markdownImageLinks, page.markdown)
arr1 := getImageLinks([]byte(page.markdown))
arr2 := getMarkdownLinks(_hugoFigureLinks, page.markdown)
arr3 := getMarkdownLinks(_hugoParallaxBlurLinks, page.markdown)
arr4 := getMarkdownLinks(_hugoAudioLinks, page.markdown)
arr5 := getMarkdownLinks(_markdownPdfLinks, page.markdown)
arr5 := getPDFLinks([]byte(page.markdown))
coverImageURL := page.getCoverImageURL()
result := append(append(append(append(arr1, arr2...), arr3...), arr4...), arr5...)
if coverImageURL != nil {
Expand All @@ -117,6 +118,47 @@ func (page *Page) WPMediaLinks() []string {
return result
}

// getImageLinks parses content as Markdown and returns the destination of
// every image node found, in the order the AST walker visits them.
// The result is nil when the document contains no images.
func getImageLinks(content []byte) []string {
	mdParser := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs)
	root := markdown.Parse(content, mdParser)

	var imageURLs []string
	collectImages := func(n ast.Node, entering bool) ast.WalkStatus {
		// Only act on the entering visit so a node is never recorded twice.
		if !entering {
			return ast.GoToNext
		}
		if image, isImage := n.(*ast.Image); isImage {
			imageURLs = append(imageURLs, string(image.Destination))
		}
		return ast.GoToNext
	}
	ast.WalkFunc(root, collectImages)

	log.Debug().Int("count", len(imageURLs)).Msg("Image links")
	return imageURLs
}

// getPDFLinks parses content as Markdown and returns the destination of every
// link node whose destination ends in ".pdf" (compared case-insensitively).
// Destinations are returned unmodified, in the order the AST walker visits
// them; the result is nil when no such link exists.
func getPDFLinks(content []byte) []string {
	mdParser := parser.NewWithExtensions(parser.CommonExtensions | parser.AutoHeadingIDs)
	root := markdown.Parse(content, mdParser)

	var pdfURLs []string
	collectPDFs := func(n ast.Node, entering bool) ast.WalkStatus {
		// Only act on the entering visit so a node is never recorded twice.
		if !entering {
			return ast.GoToNext
		}
		anchor, isLink := n.(*ast.Link)
		if !isLink {
			return ast.GoToNext
		}
		target := string(anchor.Destination)
		// Lower-case first so ".PDF", ".Pdf", etc. are accepted as well.
		lowered := strings.ToLower(target)
		if strings.HasSuffix(lowered, ".pdf") {
			pdfURLs = append(pdfURLs, target)
		}
		return ast.GoToNext
	}
	ast.WalkFunc(root, collectPDFs)

	log.Debug().Int("count", len(pdfURLs)).Msg("PDF links")
	return pdfURLs
}

func getMarkdownLinks(regex *regexp.Regexp, markdown string) []string {
var links []string
matches := regex.FindAllStringSubmatch(markdown, -1)
Expand Down

0 comments on commit b1da435

Please sign in to comment.