Skip to content

Commit

Permalink
feat: impl textract connector
Browse files Browse the repository at this point in the history
  • Loading branch information
denovVasyl committed Aug 20, 2024
1 parent eb254a1 commit fa4b8f6
Show file tree
Hide file tree
Showing 15 changed files with 793 additions and 21 deletions.
7 changes: 7 additions & 0 deletions connectors/aws/aws-textract/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@
<artifactId>aws-java-sdk-textract</artifactId>
<version>${version.aws-java-sdk}</version>
</dependency>

<!-- S3 SDK module; version follows the shared AWS SDK property so all AWS SDK artifacts stay aligned. -->
<dependency>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
<version>${version.aws-java-sdk}</version>
<scope>compile</scope>
</dependency>

</dependencies>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,35 +6,98 @@
*/
package io.camunda.connector.textract;

import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractAsync;
import com.amazonaws.services.textract.model.AnalyzeDocumentResult;
import com.amazonaws.services.textract.model.Block;
import com.amazonaws.services.textract.model.GetDocumentAnalysisResult;
import com.amazonaws.services.textract.model.StartDocumentAnalysisResult;
import io.camunda.connector.api.annotation.OutboundConnector;
import io.camunda.connector.api.outbound.OutboundConnectorContext;
import io.camunda.connector.api.outbound.OutboundConnectorFunction;
import io.camunda.connector.generator.java.annotation.ElementTemplate;
import io.camunda.connector.textract.caller.AsyncTextractCaller;
import io.camunda.connector.textract.caller.PollingTextractCalller;
import io.camunda.connector.textract.caller.SyncTextractCaller;
import io.camunda.connector.textract.model.TextractExecutionType;
import io.camunda.connector.textract.model.TextractRequest;
import io.camunda.connector.textract.suppliers.util.AmazonTextractClientUtil;

import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

@OutboundConnector(
name = "AWS Textract",
inputVariables = {"authentication", "configuration", "input"},
type = "io.camunda:aws-textract:1")
name = "AWS Textract",
inputVariables = {"authentication", "configuration", "input"},
type = "io.camunda:aws-textract:1")
@ElementTemplate(
id = "io.camunda.connectors.AWSTEXTRACT.v1",
name = "AWS Textract Outbound Connector",
description =
"Automatically extract printed text, handwriting, layout elements, and data from any document",
inputDataClass = TextractRequest.class,
version = 1,
propertyGroups = {
@ElementTemplate.PropertyGroup(id = "authentication", label = "Authentication"),
@ElementTemplate.PropertyGroup(id = "configuration", label = "Configuration"),
@ElementTemplate.PropertyGroup(id = "input", label = "Configure input")
},
documentationRef =
"https://docs.camunda.io/docs/next/components/connectors/out-of-the-box-connectors/amazon-textract/",
icon = "icon.svg")
id = "io.camunda.connectors.AWSTEXTRACT.v1",
name = "AWS Textract Outbound Connector",
description =
"Automatically extract printed text, handwriting, layout elements, and data from any document",
inputDataClass = TextractRequest.class,
version = 1,
propertyGroups = {
@ElementTemplate.PropertyGroup(id = "authentication", label = "Authentication"),
@ElementTemplate.PropertyGroup(id = "configuration", label = "Configuration"),
@ElementTemplate.PropertyGroup(id = "input", label = "Configure input")
},
documentationRef =
"https://docs.camunda.io/docs/next/components/connectors/out-of-the-box-connectors/amazon-textract/",
icon = "icon.svg")
public class TextractConnectorFunction implements OutboundConnectorFunction {

// NOTE(review): placeholder that always returns null. A second execute(...) carrying the real
// logic appears further down in this class; this stub looks like the removed side of the diff
// hunk — confirm before treating it as live code.
@Override
public Object execute(OutboundConnectorContext context) throws Exception {
return null;
}
/** Caller invoked by execute(...) when the requested execution type is SYNC. */
private final SyncTextractCaller syncTextractCaller;

/** Caller invoked by execute(...) when the requested execution type is POLLING. */
private final PollingTextractCalller pollingTextractCaller;

/** Caller invoked by execute(...) for every execution type other than SYNC and POLLING. */
private final AsyncTextractCaller asyncTextractCaller;

/** Creates the connector with a freshly constructed instance of each caller. */
public TextractConnectorFunction() {
  this(new SyncTextractCaller(), new PollingTextractCalller(), new AsyncTextractCaller());
}

/**
 * Creates the connector with explicitly supplied callers.
 *
 * @param syncTextractCaller caller used for the SYNC execution type
 * @param pollingTextractCaller caller used for the POLLING execution type
 * @param asyncTextractCaller caller used for any other execution type
 */
public TextractConnectorFunction(
    SyncTextractCaller syncTextractCaller,
    PollingTextractCalller pollingTextractCaller,
    AsyncTextractCaller asyncTextractCaller) {
  this.asyncTextractCaller = asyncTextractCaller;
  this.pollingTextractCaller = pollingTextractCaller;
  this.syncTextractCaller = syncTextractCaller;
}

/**
 * Binds the connector variables to a {@link TextractRequest} and runs the analysis in the
 * requested execution mode.
 *
 * <ul>
 *   <li>SYNC: analyzes the document with a synchronous client and returns the extracted text.
 *   <li>POLLING: starts an analysis job, polls it through the polling caller and returns the
 *       extracted text.
 *   <li>any other type: starts an analysis job and returns its job id without waiting.
 * </ul>
 *
 * <p>A Textract client is built for every invocation, so it is shut down in a {@code finally}
 * block once the call completes; otherwise each invocation would leave the client's resources
 * behind.
 *
 * @param context connector context the request variables are bound from
 * @return a {@code Set<String>} of block texts for SYNC and POLLING, the job id otherwise
 * @throws NullPointerException if the bound request carries no execution type
 * @throws Exception if binding the variables or calling Textract fails
 */
@Override
public Object execute(OutboundConnectorContext context) throws Exception {
  final var request = context.bindVariables(TextractRequest.class);
  final var reqData = request.getInput();
  // Fail fast with a named message instead of an anonymous NPE from equals().
  final var executionType =
      Objects.requireNonNull(reqData.executionType(), "executionType must not be null");

  if (executionType == TextractExecutionType.SYNC) {
    final AmazonTextract syncTextractClient =
        AmazonTextractClientUtil.getSyncTextractClient(request);
    try {
      final AnalyzeDocumentResult docResult = syncTextractCaller.call(reqData, syncTextractClient);
      final List<Block> blocks = docResult.getBlocks();

      return this.fetchText(blocks);
    } finally {
      syncTextractClient.shutdown();
    }
  }

  // POLLING and the fire-and-forget path both use the async client.
  final AmazonTextractAsync asyncTextractClient =
      AmazonTextractClientUtil.getAsyncTextractClient(request);
  try {
    if (executionType == TextractExecutionType.POLLING) {
      final GetDocumentAnalysisResult analysisResult =
          this.pollingTextractCaller.call(reqData, asyncTextractClient);
      final List<Block> blocks = analysisResult.getBlocks();

      return this.fetchText(blocks);
    }

    final StartDocumentAnalysisResult startDocumentAnalysisResult =
        this.asyncTextractCaller.call(reqData, asyncTextractClient);

    return startDocumentAnalysisResult.getJobId();
  } finally {
    // Both callers return only after their Textract calls have completed, so shutting down
    // here does not cut off in-flight work.
    asyncTextractClient.shutdown();
  }
}

/**
 * Collects the distinct, non-null texts of the given blocks.
 *
 * @param blocks blocks returned by a Textract analysis
 * @return the set of texts carried by those blocks; blocks without text are skipped
 */
private Set<String> fetchText(final List<Block> blocks) {
  return blocks.stream()
      .filter(block -> block.getText() != null)
      .map(Block::getText)
      .collect(Collectors.toSet());
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package io.camunda.connector.textract.caller;

import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.model.NotificationChannel;
import com.amazonaws.services.textract.model.OutputConfig;
import com.amazonaws.services.textract.model.StartDocumentAnalysisRequest;
import com.amazonaws.services.textract.model.StartDocumentAnalysisResult;
import io.camunda.connector.textract.model.TextractRequestData;
import org.apache.commons.lang3.StringUtils;

/**
 * Starts a Textract document analysis job and returns the start result without waiting for the
 * job to finish.
 */
public class AsyncTextractCaller implements TextractCaller<StartDocumentAnalysisResult> {

  /**
   * Builds a {@link StartDocumentAnalysisRequest} from the request data and submits it.
   *
   * @param requestData connector input describing the document and optional job settings
   * @param textractClient client used to start the analysis
   * @return the result returned by {@code startDocumentAnalysis}
   */
  @Override
  public StartDocumentAnalysisResult call(
      TextractRequestData requestData, AmazonTextract textractClient) {
    final StartDocumentAnalysisRequest analysisRequest =
        new StartDocumentAnalysisRequest()
            .withFeatureTypes(this.prepareFeatureTypes(requestData))
            .withDocumentLocation(this.prepareDocumentLocation(requestData))
            .withClientRequestToken(requestData.clientRequestToken())
            .withJobTag(requestData.jobTag())
            .withKMSKeyId(requestData.kmsKeyId());

    this.prepareNotification(analysisRequest, requestData);
    this.prepareOutput(analysisRequest, requestData);

    return textractClient.startDocumentAnalysis(analysisRequest);
  }

  /**
   * Attaches a notification channel when at least one of role ARN / SNS topic ARN is non-empty;
   * only the non-empty values are set on the channel.
   */
  private void prepareNotification(
      StartDocumentAnalysisRequest startDocumentAnalysisRequest, TextractRequestData requestData) {
    final boolean hasRoleArn = !StringUtils.isEmpty(requestData.notificationChannelRoleArn());
    final boolean hasSnsTopicArn =
        !StringUtils.isEmpty(requestData.notificationChannelSnsTopicArn());

    if (hasRoleArn || hasSnsTopicArn) {
      final NotificationChannel channel = new NotificationChannel();
      if (hasRoleArn) {
        channel.withRoleArn(requestData.notificationChannelRoleArn());
      }
      if (hasSnsTopicArn) {
        channel.withSNSTopicArn(requestData.notificationChannelSnsTopicArn());
      }
      startDocumentAnalysisRequest.withNotificationChannel(channel);
    }
  }

  /**
   * Attaches an output configuration when at least one of S3 bucket / S3 prefix is non-empty;
   * only the non-empty values are set on the configuration.
   */
  private void prepareOutput(
      StartDocumentAnalysisRequest startDocumentAnalysisRequest, TextractRequestData requestData) {
    final boolean hasBucket = !StringUtils.isEmpty(requestData.outputConfigS3Bucket());
    final boolean hasPrefix = !StringUtils.isEmpty(requestData.outputConfigS3Prefix());

    if (hasBucket || hasPrefix) {
      final OutputConfig output = new OutputConfig();
      if (hasBucket) {
        output.withS3Bucket(requestData.outputConfigS3Bucket());
      }
      if (hasPrefix) {
        output.withS3Prefix(requestData.outputConfigS3Prefix());
      }
      startDocumentAnalysisRequest.withOutputConfig(output);
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package io.camunda.connector.textract.caller;

import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractAsync;
import com.amazonaws.services.textract.model.GetDocumentAnalysisRequest;
import com.amazonaws.services.textract.model.GetDocumentAnalysisResult;
import com.amazonaws.services.textract.model.StartDocumentAnalysisRequest;
import com.amazonaws.services.textract.model.StartDocumentAnalysisResult;
import io.camunda.connector.textract.model.TextractRequestData;
import io.camunda.connector.textract.model.TextractTask;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;

import static java.util.concurrent.TimeUnit.SECONDS;

/**
 * Starts a Textract document analysis job and polls {@code GetDocumentAnalysis} until the job is
 * no longer in progress.
 */
public class PollingTextractCalller implements TextractCaller<GetDocumentAnalysisResult> {
  /** Delay, in seconds, between two consecutive status polls. */
  public final short DELAY_BETWEEN_POLLING = 5;

  /** Job status reported while the analysis is still running. */
  private static final String STATUS_IN_PROGRESS = "IN_PROGRESS";

  /**
   * Starts the analysis and blocks until the job reports a status other than in-progress.
   *
   * <p>NOTE(review): only the first {@code GetDocumentAnalysis} response is returned; a
   * {@code NextToken} on the result is not followed, so large documents may need pagination —
   * confirm against the Textract API reference.
   *
   * @param requestData connector input describing the document and requested feature types
   * @param textractClient client used for the calls; it is cast to {@link AmazonTextractAsync},
   *     so any other implementation fails with a {@link ClassCastException}
   * @return the last polled analysis result, whatever its final status is
   * @throws Exception if a poll fails or the waiting thread is interrupted
   */
  @Override
  public GetDocumentAnalysisResult call(TextractRequestData requestData, AmazonTextract textractClient) throws Exception {
    final StartDocumentAnalysisRequest startDocReq = new StartDocumentAnalysisRequest()
        .withFeatureTypes(this.prepareFeatureTypes(requestData))
        .withDocumentLocation(this.prepareDocumentLocation(requestData));

    final StartDocumentAnalysisResult result = textractClient.startDocumentAnalysis(startDocReq);
    final var documentAnalysisReq = new GetDocumentAnalysisRequest().withJobId(result.getJobId());
    final var textractTask = new TextractTask(documentAnalysisReq, (AmazonTextractAsync) textractClient);

    GetDocumentAnalysisResult analysis;
    try (ScheduledExecutorService executorService = Executors.newSingleThreadScheduledExecutor()) {
      // First poll runs immediately; later polls wait DELAY_BETWEEN_POLLING seconds.
      analysis = executorService.schedule(textractTask, 0, SECONDS).get();

      while (this.continuePolling(analysis.getJobStatus())) {
        analysis = executorService.schedule(textractTask, DELAY_BETWEEN_POLLING, SECONDS).get();
      }
    }

    return analysis;
  }

  /**
   * Decides whether another poll is needed.
   *
   * <p>Polling continues only while the job is in progress. Every other status (including
   * {@code PARTIAL_SUCCESS} and a missing status) ends the loop, so a job that finishes in a
   * state other than {@code SUCCEEDED} or {@code FAILED} can no longer be polled forever.
   *
   * @param status job status from the latest poll, may be {@code null}
   * @return {@code true} if the job is still running
   */
  private boolean continuePolling(final String status) {
    return STATUS_IN_PROGRESS.equals(status);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package io.camunda.connector.textract.caller;

import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.model.AnalyzeDocumentRequest;
import com.amazonaws.services.textract.model.AnalyzeDocumentResult;
import com.amazonaws.services.textract.model.Document;
import com.amazonaws.services.textract.model.S3Object;
import io.camunda.connector.textract.model.TextractRequestData;

/** Runs a synchronous Textract analysis of a document stored in S3. */
public class SyncTextractCaller implements TextractCaller<AnalyzeDocumentResult> {

  /**
   * Builds an {@link AnalyzeDocumentRequest} for the S3 object described by the request data and
   * executes it.
   *
   * @param requestData connector input describing the document and requested feature types
   * @param textractClient client used to analyze the document
   * @return the result returned by {@code analyzeDocument}
   */
  @Override
  public AnalyzeDocumentResult call(
      TextractRequestData requestData, AmazonTextract textractClient) {
    final Document source = new Document().withS3Object(this.prepareS3Obj(requestData));

    final AnalyzeDocumentRequest request = new AnalyzeDocumentRequest();
    request.withFeatureTypes(this.prepareFeatureTypes(requestData));
    request.withDocument(source);

    return textractClient.analyzeDocument(request);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package io.camunda.connector.textract.caller;

import com.amazonaws.AmazonWebServiceResult;
import com.amazonaws.ResponseMetadata;
import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractAsync;
import com.amazonaws.services.textract.model.DocumentLocation;
import com.amazonaws.services.textract.model.FeatureType;
import com.amazonaws.services.textract.model.S3Object;
import io.camunda.connector.textract.model.TextractRequestData;

import java.util.HashSet;
import java.util.Set;

/**
 * A way of invoking Textract for a request, together with helpers shared by all implementations
 * for turning the request data into Textract model objects.
 *
 * @param <T> type of the AWS result produced by the call
 */
public interface TextractCaller<T extends AmazonWebServiceResult<ResponseMetadata>> {

  /**
   * Invokes Textract for the given request.
   *
   * @param request connector input describing the document and options
   * @param textractClient client used to perform the call
   * @return the AWS result of the invocation
   * @throws Exception if the invocation fails
   */
  T call(final TextractRequestData request, final AmazonTextract textractClient) throws Exception;

  /** Builds the S3 object reference (bucket, name, version) of the document to analyze. */
  default S3Object prepareS3Obj(final TextractRequestData requestData) {
    final S3Object document = new S3Object();
    document.withBucket(requestData.documentS3Bucket());
    document.withName(requestData.documentName());
    document.withVersion(requestData.documentVersion());
    return document;
  }

  /** Collects the names of the feature types whose analyze flag is set on the request. */
  default Set<String> prepareFeatureTypes(final TextractRequestData request) {
    final Set<String> selected = new HashSet<>();
    if (request.analyzeForms()) {
      selected.add(FeatureType.FORMS.name());
    }
    if (request.analyzeLayout()) {
      selected.add(FeatureType.LAYOUT.name());
    }
    if (request.analyzeSignatures()) {
      selected.add(FeatureType.SIGNATURES.name());
    }
    if (request.analyzeTables()) {
      selected.add(FeatureType.TABLES.name());
    }
    return selected;
  }

  /** Wraps the document's S3 object reference in a {@link DocumentLocation}. */
  default DocumentLocation prepareDocumentLocation(final TextractRequestData request) {
    return new DocumentLocation().withS3Object(prepareS3Obj(request));
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package io.camunda.connector.textract.model;

import com.amazonaws.services.textract.AmazonTextractAsync;
import com.amazonaws.services.textract.model.GetDocumentAnalysisRequest;
import com.amazonaws.services.textract.model.GetDocumentAnalysisResult;

import java.util.concurrent.Callable;

/** Callable that fetches the current analysis result for a fixed request on each invocation. */
public class TextractTask implements Callable<GetDocumentAnalysisResult> {

  private final GetDocumentAnalysisRequest request;

  private final AmazonTextractAsync client;

  /**
   * @param documentAnalysisRequest request sent on every call
   * @param amazonTextract client the request is sent with
   */
  public TextractTask(
      GetDocumentAnalysisRequest documentAnalysisRequest, AmazonTextractAsync amazonTextract) {
    this.client = amazonTextract;
    this.request = documentAnalysisRequest;
  }

  /** Sends the stored request and returns the client's response. */
  @Override
  public GetDocumentAnalysisResult call() {
    return client.getDocumentAnalysis(request);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package io.camunda.connector.textract.suppliers.util;

import com.amazonaws.services.textract.AmazonTextract;
import com.amazonaws.services.textract.AmazonTextractAsync;
import com.amazonaws.services.textract.AmazonTextractAsyncClientBuilder;
import com.amazonaws.services.textract.AmazonTextractClientBuilder;
import io.camunda.connector.aws.CredentialsProviderSupport;
import io.camunda.connector.textract.model.TextractRequest;

/** Static factory methods for Textract clients configured from a {@link TextractRequest}. */
public class AmazonTextractClientUtil {

  private AmazonTextractClientUtil() {
    throw new UnsupportedOperationException("This is a utility class and cannot be instantiated");
  }

  /**
   * Builds a synchronous Textract client using the credentials and region of the request.
   *
   * @param request request supplying authentication and configuration
   * @return a newly built client
   */
  public static AmazonTextract getSyncTextractClient(final TextractRequest request) {
    final var builder = AmazonTextractClientBuilder.standard();
    builder.withCredentials(CredentialsProviderSupport.credentialsProvider(request));
    builder.withRegion(request.getConfiguration().region());
    return builder.build();
  }

  /**
   * Builds an asynchronous Textract client using the credentials and region of the request.
   *
   * @param request request supplying authentication and configuration
   * @return a newly built client
   */
  public static AmazonTextractAsync getAsyncTextractClient(final TextractRequest request) {
    final var builder = AmazonTextractAsyncClientBuilder.standard();
    builder.withCredentials(CredentialsProviderSupport.credentialsProvider(request));
    builder.withRegion(request.getConfiguration().region());
    return builder.build();
  }
}
Loading

0 comments on commit fa4b8f6

Please sign in to comment.