From a6f9f32d8c0a63f262b62391d3ec1d15510ad29b Mon Sep 17 00:00:00 2001 From: hungphan227 <45198168+hungphan227@users.noreply.github.com> Date: Wed, 22 Jan 2025 15:18:30 +0700 Subject: [PATCH] JAMES-4100 Improve Search Snippet display (#2583) Co-authored-by: hung phan --- .../SearchHighLighterContract.java | 39 +++++++++++++++++++ .../search/LuceneIndexableDocument.java | 2 +- .../search/LuceneSearchHighlighter.java | 2 + .../opensearch/json/IndexableMessage.java | 2 +- .../mailbox/store/search/SearchUtil.java | 17 ++++++++ .../SearchSnippetGetMethodContract.scala | 6 +-- 6 files changed, 63 insertions(+), 5 deletions(-) diff --git a/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java b/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java index 2181140be97..998b8f2ac23 100644 --- a/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java +++ b/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java @@ -532,4 +532,43 @@ default void shouldHighLightBodyWhenHTMLBodyMatched() throws Exception { softly.assertThat(searchSnippets.getFirst().highlightedBody().get()).contains("barcamp"); }); } + + @Test + default void highlightSearchShouldShortenGreaterThanCharacters() throws Exception { + MailboxSession session = session(USERNAME1); + + // Given m1,m2 with m1 has body containing the searched word (contentA) + ComposedMessageId m1 = appendMessage(MessageManager.AppendCommand.from( + Message.Builder.of() + .setTo("to@james.local") + .setSubject("Hallo, Thx Matthieu for your help") + .setBody("Start \n>>>>>>>>>> append contentA to > inbox \n>>>>>> End", + StandardCharsets.UTF_8)), + session).getId(); + + ComposedMessageId m2 = appendMessage(MessageManager.AppendCommand.from( + Message.Builder.of() + .setTo("to@james.local") + .setSubject("Hallo, Thx Alex for your help") + .setBody("append contentB to inbox", StandardCharsets.UTF_8)), + session).getId(); + + verifyMessageWasIndexed(2); + + // When searching for the word (contentA) in the body + MultimailboxesSearchQuery multiMailboxSearch = MultimailboxesSearchQuery.from(SearchQuery.of( + SearchQuery.bodyContains("contentA"))) + .inMailboxes(List.of(m1.getMailboxId(), m2.getMailboxId())) + .build(); + + // Then highlightSearch should return the SearchSnippet with the highlightedBody containing the word (contentA) + List searchSnippets = Flux.from(testee().highlightSearch(List.of(m1.getMessageId(), m2.getMessageId()), multiMailboxSearch, session)) + .collectList() + .block(); + assertThat(searchSnippets).hasSize(1); + assertSoftly(softly -> { + softly.assertThat(searchSnippets.getFirst().messageId()).isEqualTo(m1.getMessageId()); + softly.assertThat(searchSnippets.getFirst().highlightedBody().get()).isEqualTo("Start \n append contentA to > inbox \n End"); + }); + } } diff --git a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java index bee20308d28..4f14d14c3f8 100644 --- a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java +++ b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java @@ -195,7 +195,7 @@ public Document createMessageDocument(MailboxMessage message, MailboxSession ses doc.add(new TextField(BCC_FIELD, uppercase(EMailers.from(headerCollection.getBccAddressSet()).serialize()), Field.Store.YES)); // index body - Optional bodyText = mimePartExtracted.locateFirstTextBody(); + Optional bodyText = mimePartExtracted.locateFirstTextBody().map(SearchUtil::removeGreaterThanCharactersAtBeginningOfLine); Optional bodyHtml = mimePartExtracted.locateFirstHtmlBody(); bodyText.or(() -> bodyHtml) diff --git a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java index c398865df8c..8a25d922c4a 100644 --- a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java +++ b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneSearchHighlighter.java @@ -57,6 +57,7 @@ import org.apache.lucene.search.highlight.Formatter; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; +import org.apache.lucene.search.highlight.SimpleHTMLEncoder; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.search.highlight.SimpleSpanFragmenter; @@ -126,6 +127,7 @@ private Highlighter highlighter(SearchQuery searchQuery) { Query query = buildQueryFromSearchQuery(searchQuery); QueryScorer scorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, scorer); + highlighter.setEncoder(new SimpleHTMLEncoder()); highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, configuration.fragmentSize())); return highlighter; } diff --git a/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java b/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java index 722d8b8988e..8bfaf646f88 100644 --- a/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java +++ b/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java @@ -135,7 +135,7 @@ private Mono instantiateIndexedMessage() throws IOException, M .asMimePart(textExtractor) .map(parsingResult -> { - Optional bodyText = parsingResult.locateFirstTextBody(); + Optional bodyText = parsingResult.locateFirstTextBody().map(SearchUtil::removeGreaterThanCharactersAtBeginningOfLine); Optional bodyHtml = parsingResult.locateFirstHtmlBody(); boolean hasAttachment = MessageAttachmentMetadata.hasNonInlinedAttachment(message.getAttachments()); diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java index 8c7686f60c4..0a90b132b65 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java @@ -473,5 +473,22 @@ public boolean test(MessageId input) { }; } + public static String removeGreaterThanCharactersAtBeginningOfLine(String text) { + StringBuilder result = new StringBuilder(); + boolean isNewLine = false; + for (int i = 0; i < text.length(); i++) { + char current = text.charAt(i); + + if (current == '\n') { + isNewLine = true; + result.append(current); + } else if (!isNewLine || current != '>') { + result.append(current); + isNewLine = false; + } + } + + return result.toString(); + } } diff --git a/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala b/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala index 7246e6a80ed..17cd4d3cf92 100644 --- a/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala +++ b/server/protocols/jmap-rfc-8621-integration-tests/jmap-rfc-8621-integration-tests-common/src/main/scala/org/apache/james/jmap/rfc8621/contract/SearchSnippetGetMethodContract.scala @@ -529,12 +529,12 @@ trait SearchSnippetGetMethodContract { | "list": [ | { | "emailId": "${messageId1.serialize}", - | "subject": "Weekly report - vttran 27/02-03/03/2023", + | "subject": "Weekly report - vttran 27/02-03/03/2023", | "preview": null | }, | { | "emailId": "${messageId2.serialize}", - | "subject": "Weekly report - vttran 19/08-23/08/2024", + | "subject": "Weekly report - vttran 19/08-23/08/2024", | "preview": null | } | ], @@ -610,7 +610,7 @@ trait SearchSnippetGetMethodContract { | "list": [ | { | "emailId": "${messageId1.serialize}", - | "subject": "Weekly report - vttran 27/02-03/03/2023", + | "subject": "Weekly report - vttran 27/02-03/03/2023", | "preview": null | }, | {