Skip to content

Commit

Permalink
feat: implement base element handling in content scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
fguillot committed Jul 26, 2024
1 parent c0f6e32 commit 368fd64
Show file tree
Hide file tree
Showing 5 changed files with 223 additions and 38 deletions.
25 changes: 17 additions & 8 deletions internal/reader/processor/processor.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
entry.URL = cleanedURL
}

pageBaseURL := ""
rewrittenURL := rewriteEntryURL(feed, entry)
entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
if feed.Crawler && (entryIsNew || forceRefresh) {
Expand All @@ -87,12 +88,16 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
requestBuilder.DisableHTTP2(feed.DisableHTTP2)

content, scraperErr := scraper.ScrapeWebsite(
scrapedPageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
requestBuilder,
rewrittenURL,
feed.ScraperRules,
)

if scrapedPageBaseURL != "" {
pageBaseURL = scrapedPageBaseURL
}

if config.Opts.HasMetricsCollector() {
status := "success"
if scraperErr != nil {
Expand All @@ -109,16 +114,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, user *model.Us
slog.String("feed_url", feed.FeedURL),
slog.Any("error", scraperErr),
)
} else if content != "" {
} else if extractedContent != "" {
// We replace the entry content only if the scraper doesn't return any error.
entry.Content = minifyEntryContent(content)
entry.Content = minifyEntryContent(extractedContent)
}
}

rewrite.Rewriter(rewrittenURL, entry, feed.RewriteRules)

if pageBaseURL == "" {
pageBaseURL = rewrittenURL
}

// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered out.
entry.Content = sanitizer.Sanitize(rewrittenURL, entry.Content)
entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)

updateEntryReadingTime(store, feed, entry, entryIsNew, user)
filteredEntries = append(filteredEntries, entry)
Expand Down Expand Up @@ -280,7 +289,7 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
requestBuilder.IgnoreTLSErrors(feed.AllowSelfSignedCertificates)
requestBuilder.DisableHTTP2(feed.DisableHTTP2)

content, scraperErr := scraper.ScrapeWebsite(
pageBaseURL, extractedContent, scraperErr := scraper.ScrapeWebsite(
requestBuilder,
rewrittenEntryURL,
feed.ScraperRules,
Expand All @@ -298,15 +307,15 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
return scraperErr
}

if content != "" {
entry.Content = minifyEntryContent(content)
if extractedContent != "" {
entry.Content = minifyEntryContent(extractedContent)
if user.ShowReadingTime {
entry.ReadingTime = readingtime.EstimateReadingTime(entry.Content, user.DefaultReadingSpeed, user.CJKReadingSpeed)
}
}

rewrite.Rewriter(rewrittenEntryURL, entry, entry.Feed.RewriteRules)
entry.Content = sanitizer.Sanitize(rewrittenEntryURL, entry.Content)
entry.Content = sanitizer.Sanitize(pageBaseURL, entry.Content)

return nil
}
Expand Down
17 changes: 13 additions & 4 deletions internal/reader/readability/readability.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (

"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
"miniflux.app/v2/internal/urllib"
)

const (
Expand Down Expand Up @@ -69,10 +70,17 @@ func (c candidateList) String() string {
}

// ExtractContent returns relevant content.
func ExtractContent(page io.Reader) (string, error) {
func ExtractContent(page io.Reader) (baseURL string, extractedContent string, err error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
return "", err
return "", "", err
}

if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
hrefValue = strings.TrimSpace(hrefValue)
if urllib.IsAbsoluteURL(hrefValue) {
baseURL = hrefValue
}
}

document.Find("script,style").Each(func(i int, s *goquery.Selection) {
Expand All @@ -86,12 +94,13 @@ func ExtractContent(page io.Reader) (string, error) {
topCandidate := getTopCandidate(document, candidates)

slog.Debug("Readability parsing",
slog.String("base_url", baseURL),
slog.Any("candidates", candidates),
slog.Any("topCandidate", topCandidate),
)

output := getArticle(topCandidate, candidates)
return output, nil
extractedContent = getArticle(topCandidate, candidates)
return baseURL, extractedContent, nil
}

// Now that we have the top candidate, look through its siblings for content that might also be related.
Expand Down
102 changes: 102 additions & 0 deletions internal/reader/readability/readability_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

package readability // import "miniflux.app/v2/internal/reader/readability"

import (
"strings"
"testing"
)

// TestBaseURL checks that an absolute href declared by a <base> element in the
// document head is reported as the base URL, and that the trailing whitespace
// present in the attribute value is stripped from the result.
func TestBaseURL(t *testing.T) {
	const want = "https://example.org/"

	page := strings.NewReader(`
<html>
<head>
<base href="https://example.org/ ">
</head>
<body>
<article>
Some content
</article>
</body>
</html>`)

	got, _, extractErr := ExtractContent(page)
	switch {
	case extractErr != nil:
		t.Fatal(extractErr)
	case got != want:
		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, got)
	}
}

// TestMultipleBaseURL checks that when the document head declares several
// <base> elements, the href of the first one is the base URL that is returned.
func TestMultipleBaseURL(t *testing.T) {
	const want = "https://example.org/"

	page := strings.NewReader(`
<html>
<head>
<base href="https://example.org/ ">
<base href="https://example.com/ ">
</head>
<body>
<article>
Some content
</article>
</body>
</html>`)

	got, _, extractErr := ExtractContent(page)
	switch {
	case extractErr != nil:
		t.Fatal(extractErr)
	case got != want:
		t.Errorf(`Unexpected base URL, got %q instead of "https://example.org/"`, got)
	}
}

// TestRelativeBaseURL checks that a <base> element whose href is a relative
// path is not reported as a base URL: ExtractContent is expected to return an
// empty base URL in that case.
func TestRelativeBaseURL(t *testing.T) {
	html := `
<html>
<head>
<base href="/test/ ">
</head>
<body>
<article>
Some content
</article>
</body>
</html>`

	baseURL, _, err := ExtractContent(strings.NewReader(html))
	if err != nil {
		t.Fatal(err)
	}

	// State the expected value in the failure message, matching the other
	// empty-base-URL test in this file.
	if baseURL != "" {
		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
	}
}

// TestWithoutBaseURL checks that a document whose head contains no <base>
// element yields an empty base URL.
func TestWithoutBaseURL(t *testing.T) {
	page := strings.NewReader(`
<html>
<head>
<title>Test</title>
</head>
<body>
<article>
Some content
</article>
</body>
</html>`)

	got, _, extractErr := ExtractContent(page)
	switch {
	case extractErr != nil:
		t.Fatal(extractErr)
	case got != "":
		t.Errorf(`Unexpected base URL, got %q instead of ""`, got)
	}
}
53 changes: 29 additions & 24 deletions internal/reader/scraper/scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,72 +18,77 @@ import (
"golang.org/x/net/html/charset"
)

func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
defer responseHandler.Close()

if localizedError := responseHandler.LocalizedError(); localizedError != nil {
slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
return "", localizedError.Error()
slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
return "", "", localizedError.Error()
}

if !isAllowedContentType(responseHandler.ContentType()) {
return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
}

// The entry URL could redirect somewhere else.
sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
websiteURL = responseHandler.EffectiveURL()
sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
pageURL = responseHandler.EffectiveURL()

if rules == "" {
rules = getPredefinedScraperRules(websiteURL)
rules = getPredefinedScraperRules(pageURL)
}

var content string
var err error

htmlDocumentReader, err := charset.NewReader(
responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
responseHandler.ContentType(),
)
if err != nil {
return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
}

if sameSite && rules != "" {
slog.Debug("Extracting content with custom rules",
"url", websiteURL,
"url", pageURL,
"rules", rules,
)
content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
} else {
slog.Debug("Extracting content with readability",
"url", websiteURL,
"url", pageURL,
)
content, err = readability.ExtractContent(htmlDocumentReader)
baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
}

if err != nil {
return "", err
if baseURL == "" {
baseURL = pageURL
} else {
slog.Debug("Using base URL from HTML document", "base_url", baseURL)
}

return content, nil
return baseURL, extractedContent, nil
}

func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
document, err := goquery.NewDocumentFromReader(page)
if err != nil {
return "", err
return "", "", err
}

if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
hrefValue = strings.TrimSpace(hrefValue)
if urllib.IsAbsoluteURL(hrefValue) {
baseURL = hrefValue
}
}

contents := ""
document.Find(rules).Each(func(i int, s *goquery.Selection) {
if content, err := goquery.OuterHtml(s); err == nil {
contents += content
extractedContent += content
}
})

return contents, nil
return baseURL, extractedContent, nil
}

func getPredefinedScraperRules(websiteURL string) string {
Expand Down
Loading

0 comments on commit 368fd64

Please sign in to comment.