From 5089a3c3497fdb571882ef368561ec83637e72db Mon Sep 17 00:00:00 2001 From: hung phan Date: Mon, 6 Jan 2025 14:40:45 +0700 Subject: [PATCH] JAMES-4100 Improve Search Snippet display --- .../SearchHighLighterContract.java | 39 +++++++++++++++++++ .../search/LuceneIndexableDocument.java | 4 +- .../opensearch/json/IndexableMessage.java | 4 +- .../mailbox/store/search/SearchUtil.java | 17 ++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) diff --git a/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java b/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java index 2181140be97..a67b6b294f2 100644 --- a/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java +++ b/mailbox/api/src/test/java/org/apache/james/mailbox/searchhighligt/SearchHighLighterContract.java @@ -532,4 +532,43 @@ default void shouldHighLightBodyWhenHTMLBodyMatched() throws Exception { softly.assertThat(searchSnippets.getFirst().highlightedBody().get()).contains("barcamp"); }); } + + @Test + default void highlightSearchShouldShortenGreaterThanCharacters() throws Exception { + MailboxSession session = session(USERNAME1); + + // Given m1,m2 with m1 has body containing the searched word (contentA) + ComposedMessageId m1 = appendMessage(MessageManager.AppendCommand.from( + Message.Builder.of() + .setTo("to@james.local") + .setSubject("Hallo, Thx Matthieu for your help") + .setBody("Start \n>>>>>>>>>> append contentA to > inbox \n>>>>>> End", + StandardCharsets.UTF_8)), + session).getId(); + + ComposedMessageId m2 = appendMessage(MessageManager.AppendCommand.from( + Message.Builder.of() + .setTo("to@james.local") + .setSubject("Hallo, Thx Alex for your help") + .setBody("append contentB to inbox", StandardCharsets.UTF_8)), + session).getId(); + + verifyMessageWasIndexed(2); + + // When searching for the word (contentA) in the body + MultimailboxesSearchQuery multiMailboxSearch = MultimailboxesSearchQuery.from(SearchQuery.of( + SearchQuery.bodyContains("contentA"))) + .inMailboxes(List.of(m1.getMailboxId(), m2.getMailboxId())) + .build(); + + // Then highlightSearch should return the SearchSnippet with the highlightedBody containing the word (contentA) + List searchSnippets = Flux.from(testee().highlightSearch(List.of(m1.getMessageId(), m2.getMessageId()), multiMailboxSearch, session)) + .collectList() + .block(); + assertThat(searchSnippets).hasSize(1); + assertSoftly(softly -> { + softly.assertThat(searchSnippets.getFirst().messageId()).isEqualTo(m1.getMessageId()); + softly.assertThat(searchSnippets.getFirst().highlightedBody().get()).isEqualTo("Start \n append contentA to > inbox \n End"); + }); + } } diff --git a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java index bee20308d28..6d0222204f5 100644 --- a/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java +++ b/mailbox/lucene/src/main/java/org/apache/james/mailbox/lucene/search/LuceneIndexableDocument.java @@ -195,8 +195,8 @@ public Document createMessageDocument(MailboxMessage message, MailboxSession ses doc.add(new TextField(BCC_FIELD, uppercase(EMailers.from(headerCollection.getBccAddressSet()).serialize()), Field.Store.YES)); // index body - Optional bodyText = mimePartExtracted.locateFirstTextBody(); - Optional bodyHtml = mimePartExtracted.locateFirstHtmlBody(); + Optional bodyText = mimePartExtracted.locateFirstTextBody().map(SearchUtil::removeGreaterThanCharacters); + Optional bodyHtml = mimePartExtracted.locateFirstHtmlBody().map(SearchUtil::removeGreaterThanCharacters); bodyText.or(() -> bodyHtml) .ifPresent(bodyContent -> doc.add(new TextField(BODY_FIELD, bodyContent, Field.Store.YES))); diff --git a/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java b/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java index 722d8b8988e..20804d8688c 100644 --- a/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java +++ b/mailbox/opensearch/src/main/java/org/apache/james/mailbox/opensearch/json/IndexableMessage.java @@ -135,8 +135,8 @@ private Mono instantiateIndexedMessage() throws IOException, M .asMimePart(textExtractor) .map(parsingResult -> { - Optional bodyText = parsingResult.locateFirstTextBody(); - Optional bodyHtml = parsingResult.locateFirstHtmlBody(); + Optional bodyText = parsingResult.locateFirstTextBody().map(SearchUtil::removeGreaterThanCharacters); + Optional bodyHtml = parsingResult.locateFirstHtmlBody().map(SearchUtil::removeGreaterThanCharacters); boolean hasAttachment = MessageAttachmentMetadata.hasNonInlinedAttachment(message.getAttachments()); List attachments = setFlattenedAttachments(parsingResult, indexAttachments); diff --git a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java index 8c7686f60c4..0574b4a69a4 100644 --- a/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java +++ b/mailbox/store/src/main/java/org/apache/james/mailbox/store/search/SearchUtil.java @@ -473,5 +473,22 @@ public boolean test(MessageId input) { }; } + public static String removeGreaterThanCharacters(String text) { + StringBuilder result = new StringBuilder(); + boolean isNewLine = false; + for (int i = 0; i < text.length(); i++) { + char current = text.charAt(i); + + if (current == '\n') { + isNewLine = true; + result.append(current); + } else if (!isNewLine || current != '>') { + result.append(current); + isNewLine = false; + } + } + + return result.toString(); + } }