From 1d3561b7c4202f45db5f7443e9e39f798fb4667d Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Mon, 30 Dec 2024 16:46:45 -0800 Subject: [PATCH 01/10] Remove duplicated scalapb definition (#3182) The scalapb proto definition both present in workflow-core and amber. This PR removes the second copy. --- .../src/main/protobuf/scalapb/scalapb.proto | 363 ------------------ .../src/main/protobuf/scalapb/scalapb.proto | 10 +- 2 files changed, 5 insertions(+), 368 deletions(-) delete mode 100644 core/amber/src/main/protobuf/scalapb/scalapb.proto diff --git a/core/amber/src/main/protobuf/scalapb/scalapb.proto b/core/amber/src/main/protobuf/scalapb/scalapb.proto deleted file mode 100644 index d35d373f391..00000000000 --- a/core/amber/src/main/protobuf/scalapb/scalapb.proto +++ /dev/null @@ -1,363 +0,0 @@ -syntax = "proto2"; - -package scalapb; - -option java_package = "scalapb.options"; - -option (options) = { - package_name: "scalapb.options" - flat_package: true -}; - -import "google/protobuf/descriptor.proto"; - -message ScalaPbOptions { - // If set then it overrides the java_package and package. - optional string package_name = 1; - - // If true, the compiler does not append the proto base file name - // into the generated package name. If false (the default), the - // generated scala package name is the package_name.basename where - // basename is the proto file name without the .proto extension. - optional bool flat_package = 2; - - // Adds the following imports at the top of the file (this is meant - // to provide implicit TypeMappers) - repeated string import = 3; - - // Text to add to the generated scala file. This can be used only - // when single_file is true. - repeated string preamble = 4; - - // If true, all messages and enums (but not services) will be written - // to a single Scala file. - optional bool single_file = 5; - - // By default, wrappers defined at - // https://github.com/google/protobuf/blob/master/src/google/protobuf/wrappers.proto, - // are mapped to an Option[T] where T is a primitive type. When this field - // is set to true, we do not perform this transformation. - optional bool no_primitive_wrappers = 7; - - // DEPRECATED. In ScalaPB <= 0.5.47, it was necessary to explicitly enable - // primitive_wrappers. This field remains here for backwards compatibility, - // but it has no effect on generated code. It is an error to set both - // `primitive_wrappers` and `no_primitive_wrappers`. - optional bool primitive_wrappers = 6; - - // Scala type to be used for repeated fields. If unspecified, - // `scala.collection.Seq` will be used. - optional string collection_type = 8; - - // If set to true, all generated messages in this file will preserve unknown - // fields. - optional bool preserve_unknown_fields = 9 [default = true]; - - // If defined, sets the name of the file-level object that would be generated. This - // object extends `GeneratedFileObject` and contains descriptors, and list of message - // and enum companions. - optional string object_name = 10; - - // Whether to apply the options only to this file, or for the entire package (and its subpackages) - enum OptionsScope { - // Apply the options for this file only (default) - FILE = 0; - - // Apply the options for the entire package and its subpackages. - PACKAGE = 1; - } - // Experimental: scope to apply the given options. - optional OptionsScope scope = 11; - - // If true, lenses will be generated. 
- optional bool lenses = 12 [default = true]; - - // If true, then source-code info information will be included in the - // generated code - normally the source code info is cleared out to reduce - // code size. The source code info is useful for extracting source code - // location from the descriptors as well as comments. - optional bool retain_source_code_info = 13; - - // Scala type to be used for maps. If unspecified, - // `scala.collection.immutable.Map` will be used. - optional string map_type = 14; - - // If true, no default values will be generated in message constructors. - optional bool no_default_values_in_constructor = 15; - - /* Naming convention for generated enum values */ - enum EnumValueNaming { - AS_IN_PROTO = 0; // Enum value names in Scala use the same name as in the proto - CAMEL_CASE = 1; // Convert enum values to CamelCase in Scala. - } - optional EnumValueNaming enum_value_naming = 16; - - // Indicate if prefix (enum name + optional underscore) should be removed in scala code - // Strip is applied before enum value naming changes. - optional bool enum_strip_prefix = 17 [default = false]; - - // Scala type to use for bytes fields. - optional string bytes_type = 21; - - // Enable java conversions for this file. - optional bool java_conversions = 23; - - // AuxMessageOptions enables you to set message-level options through package-scoped options. - // This is useful when you can't add a dependency on scalapb.proto from the proto file that - // defines the message. - message AuxMessageOptions { - // The fully-qualified name of the message in the proto name space. - optional string target = 1; - - // Options to apply to the message. If there are any options defined on the target message - // they take precedence over the options. - optional MessageOptions options = 2; - } - - // AuxFieldOptions enables you to set field-level options through package-scoped options. - // This is useful when you can't add a dependency on scalapb.proto from the proto file that - // defines the field. - message AuxFieldOptions { - // The fully-qualified name of the field in the proto name space. - optional string target = 1; - - // Options to apply to the field. If there are any options defined on the target message - // they take precedence over the options. - optional FieldOptions options = 2; - } - - // AuxEnumOptions enables you to set enum-level options through package-scoped options. - // This is useful when you can't add a dependency on scalapb.proto from the proto file that - // defines the enum. - message AuxEnumOptions { - // The fully-qualified name of the enum in the proto name space. - optional string target = 1; - - // Options to apply to the enum. If there are any options defined on the target enum - // they take precedence over the options. - optional EnumOptions options = 2; - } - - // AuxEnumValueOptions enables you to set enum value level options through package-scoped - // options. This is useful when you can't add a dependency on scalapb.proto from the proto - // file that defines the enum. - message AuxEnumValueOptions { - // The fully-qualified name of the enum value in the proto name space. - optional string target = 1; - - // Options to apply to the enum value. If there are any options defined on - // the target enum value they take precedence over the options. - optional EnumValueOptions options = 2; - } - - // List of message options to apply to some messages. 
- repeated AuxMessageOptions aux_message_options = 18; - - // List of message options to apply to some fields. - repeated AuxFieldOptions aux_field_options = 19; - - // List of message options to apply to some enums. - repeated AuxEnumOptions aux_enum_options = 20; - - // List of enum value options to apply to some enum values. - repeated AuxEnumValueOptions aux_enum_value_options = 22; - - // List of preprocessors to apply. - repeated string preprocessors = 24; - - repeated FieldTransformation field_transformations = 25; - - // Ignores all transformations for this file. This is meant to allow specific files to - // opt out from transformations inherited through package-scoped options. - optional bool ignore_all_transformations = 26; - - // If true, getters will be generated. - optional bool getters = 27 [default = true]; - - // For use in tests only. Inhibit Java conversions even when when generator parameters - // request for it. - optional bool test_only_no_java_conversions = 999; - - extensions 1000 to max; -} - -extend google.protobuf.FileOptions { - // File-level optionals for ScalaPB. - // Extension number officially assigned by protobuf-global-extension-registry@google.com - optional ScalaPbOptions options = 1020; -} - -message MessageOptions { - // Additional classes and traits to mix in to the case class. - repeated string extends = 1; - - // Additional classes and traits to mix in to the companion object. - repeated string companion_extends = 2; - - // Custom annotations to add to the generated case class. - repeated string annotations = 3; - - // All instances of this message will be converted to this type. An implicit TypeMapper - // must be present. - optional string type = 4; - - // Custom annotations to add to the companion object of the generated class. - repeated string companion_annotations = 5; - - // Additional classes and traits to mix in to generated sealed_oneof base trait. - repeated string sealed_oneof_extends = 6; - - // If true, when this message is used as an optional field, do not wrap it in an `Option`. - // This is equivalent of setting `(field).no_box` to true on each field with the message type. - optional bool no_box = 7; - - // Custom annotations to add to the generated `unknownFields` case class field. - repeated string unknown_fields_annotations = 8; - - extensions 1000 to max; -} - -extend google.protobuf.MessageOptions { - // Message-level optionals for ScalaPB. - // Extension number officially assigned by protobuf-global-extension-registry@google.com - optional MessageOptions message = 1020; -} - -// Represents a custom Collection type in Scala. This allows ScalaPB to integrate with -// collection types that are different enough from the ones in the standard library. -message Collection { - // Type of the collection - optional string type = 1; - - // Set to true if this collection type is not allowed to be empty, for example - // cats.data.NonEmptyList. When true, ScalaPB will not generate `clearX` for the repeated - // field and not provide a default argument in the constructor. - optional bool non_empty = 2; - - // An Adapter is a Scala object available at runtime that provides certain static methods - // that can operate on this collection type. - optional string adapter = 3; -} - -message FieldOptions { - optional string type = 1; - - optional string scala_name = 2; - - // Can be specified only if this field is repeated. If unspecified, - // it falls back to the file option named `collection_type`, which defaults - // to `scala.collection.Seq`. 
- optional string collection_type = 3; - - optional Collection collection = 8; - - // If the field is a map, you can specify custom Scala types for the key - // or value. - optional string key_type = 4; - optional string value_type = 5; - - // Custom annotations to add to the field. - repeated string annotations = 6; - - // Can be specified only if this field is a map. If unspecified, - // it falls back to the file option named `map_type` which defaults to - // `scala.collection.immutable.Map` - optional string map_type = 7; - - // Do not box this value in Option[T]. If set, this overrides MessageOptions.no_box - optional bool no_box = 30; - - // Like no_box it does not box a value in Option[T], but also fails parsing when a value - // is not provided. This enables to emulate required fields in proto3. - optional bool required = 31; - - extensions 1000 to max; -} - -extend google.protobuf.FieldOptions { - // Field-level optionals for ScalaPB. - // Extension number officially assigned by protobuf-global-extension-registry@google.com - optional FieldOptions field = 1020; -} - -message EnumOptions { - // Additional classes and traits to mix in to the base trait - repeated string extends = 1; - - // Additional classes and traits to mix in to the companion object. - repeated string companion_extends = 2; - - // All instances of this enum will be converted to this type. An implicit TypeMapper - // must be present. - optional string type = 3; - - // Custom annotations to add to the generated enum's base class. - repeated string base_annotations = 4; - - // Custom annotations to add to the generated trait. - repeated string recognized_annotations = 5; - - // Custom annotations to add to the generated Unrecognized case class. - repeated string unrecognized_annotations = 6; - - extensions 1000 to max; -} - -extend google.protobuf.EnumOptions { - // Enum-level optionals for ScalaPB. - // Extension number officially assigned by protobuf-global-extension-registry@google.com - // - // The field is called enum_options and not enum since enum is not allowed in Java. - optional EnumOptions enum_options = 1020; -} - -message EnumValueOptions { - // Additional classes and traits to mix in to an individual enum value. - repeated string extends = 1; - - // Name in Scala to use for this enum value. - optional string scala_name = 2; - - // Custom annotations to add to the generated case object for this enum value. - repeated string annotations = 3; - - extensions 1000 to max; -} - -extend google.protobuf.EnumValueOptions { - // Enum-level optionals for ScalaPB. - // Extension number officially assigned by protobuf-global-extension-registry@google.com - optional EnumValueOptions enum_value = 1020; -} - -message OneofOptions { - // Additional traits to mix in to a oneof. - repeated string extends = 1; - - // Name in Scala to use for this oneof field. - optional string scala_name = 2; - - extensions 1000 to max; -} - -extend google.protobuf.OneofOptions { - // Enum-level optionals for ScalaPB. 
- // Extension number officially assigned by protobuf-global-extension-registry@google.com - optional OneofOptions oneof = 1020; -} - -enum MatchType { - CONTAINS = 0; - EXACT = 1; - PRESENCE = 2; -} - -message FieldTransformation { - optional google.protobuf.FieldDescriptorProto when = 1; - optional MatchType match_type = 2 [default = CONTAINS]; - optional google.protobuf.FieldOptions set = 3; -} - -message PreprocessorOutput { - map options_by_file = 1; -} diff --git a/core/workflow-core/src/main/protobuf/scalapb/scalapb.proto b/core/workflow-core/src/main/protobuf/scalapb/scalapb.proto index bf58fe15204..d35d373f391 100644 --- a/core/workflow-core/src/main/protobuf/scalapb/scalapb.proto +++ b/core/workflow-core/src/main/protobuf/scalapb/scalapb.proto @@ -51,7 +51,7 @@ message ScalaPbOptions { // If set to true, all generated messages in this file will preserve unknown // fields. - optional bool preserve_unknown_fields = 9 [default=true]; + optional bool preserve_unknown_fields = 9 [default = true]; // If defined, sets the name of the file-level object that would be generated. This // object extends `GeneratedFileObject` and contains descriptors, and list of message @@ -70,7 +70,7 @@ message ScalaPbOptions { optional OptionsScope scope = 11; // If true, lenses will be generated. - optional bool lenses = 12 [default=true]; + optional bool lenses = 12 [default = true]; // If true, then source-code info information will be included in the // generated code - normally the source code info is cleared out to reduce @@ -94,7 +94,7 @@ message ScalaPbOptions { // Indicate if prefix (enum name + optional underscore) should be removed in scala code // Strip is applied before enum value naming changes. - optional bool enum_strip_prefix = 17 [default=false]; + optional bool enum_strip_prefix = 17 [default = false]; // Scala type to use for bytes fields. optional string bytes_type = 21; @@ -172,7 +172,7 @@ message ScalaPbOptions { optional bool ignore_all_transformations = 26; // If true, getters will be generated. - optional bool getters = 27 [default=true]; + optional bool getters = 27 [default = true]; // For use in tests only. Inhibit Java conversions even when when generator parameters // request for it. @@ -354,7 +354,7 @@ enum MatchType { message FieldTransformation { optional google.protobuf.FieldDescriptorProto when = 1; - optional MatchType match_type = 2 [default=CONTAINS]; + optional MatchType match_type = 2 [default = CONTAINS]; optional google.protobuf.FieldOptions set = 3; } From 4ea5fac9741eaa0dd3bf5a26e532a6d85de161d9 Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Mon, 30 Dec 2024 17:28:57 -0800 Subject: [PATCH 02/10] Fix python proto gen (#3184) The Python protobuf-generated code was outdated. This PR updates the generation script to include all protobuf definitions from the workflow-core and amber sub-projects, ensuring that the latest Python code is generated and aligned with the current protobuf definitions. 
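The identity and workflow types (e.g., ActorVirtualIdentity, PortIdentity, PhysicalLink) now live under proto.edu.uci.ics.amber.core, while payload types such as ControlPayloadV2 remain under proto.edu.uci.ics.amber.engine.common. Each affected call site migrates its imports accordingly; a representative before/after, mirroring the hunks below:

    # before: identities were imported from the engine.common package
    # from proto.edu.uci.ics.amber.engine.common import (
    #     ActorVirtualIdentity,
    #     ControlPayloadV2,
    # )

    # after: identities come from the new core package
    from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2
    from proto.edu.uci.ics.amber.core import ActorVirtualIdentity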
--- .../actorcommand/backpressure_handler.py | 2 +- .../architecture/managers/pause_manager.py | 2 +- .../managers/statistics_manager.py | 2 +- .../managers/tuple_processing_manager.py | 2 +- .../architecture/packaging/input_manager.py | 2 +- .../architecture/packaging/output_manager.py | 2 +- .../core/architecture/rpc/async_rpc_client.py | 6 +- .../core/architecture/rpc/async_rpc_server.py | 6 +- .../sendsemantics/broad_cast_partitioner.py | 2 +- .../hash_based_shuffle_partitioner.py | 2 +- .../sendsemantics/one_to_one_partitioner.py | 2 +- .../architecture/sendsemantics/partitioner.py | 2 +- .../range_based_shuffle_partitioner.py | 2 +- .../sendsemantics/round_robin_partitioner.py | 2 +- .../python/core/models/internal_marker.py | 2 +- .../main/python/core/models/internal_queue.py | 3 +- .../main/python/core/runnables/main_loop.py | 7 +- .../python/core/runnables/network_sender.py | 2 +- .../core/runnables/test_console_message.py | 2 +- .../python/core/runnables/test_main_loop.py | 4 +- .../core/runnables/test_network_receiver.py | 6 +- .../proto/edu/uci/ics/amber/core/__init__.py | 108 ++++++++++++ .../amber/engine/architecture/rpc/__init__.py | 55 +++--- .../architecture/sendsemantics/__init__.py | 12 +- .../engine/architecture/worker/__init__.py | 4 +- .../uci/ics/amber/engine/common/__init__.py | 94 +--------- .../src/main/python/proto/scalapb/__init__.py | 164 +++++++++--------- core/scripts/python-proto-gen.sh | 11 +- 28 files changed, 265 insertions(+), 245 deletions(-) create mode 100644 core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py diff --git a/core/amber/src/main/python/core/architecture/handlers/actorcommand/backpressure_handler.py b/core/amber/src/main/python/core/architecture/handlers/actorcommand/backpressure_handler.py index e1171cf8870..d96331795fe 100644 --- a/core/amber/src/main/python/core/architecture/handlers/actorcommand/backpressure_handler.py +++ b/core/amber/src/main/python/core/architecture/handlers/actorcommand/backpressure_handler.py @@ -13,9 +13,9 @@ from proto.edu.uci.ics.amber.engine.common import ( Backpressure, - ActorVirtualIdentity, ControlPayloadV2, ) +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class BackpressureHandler(ActorCommandHandler): diff --git a/core/amber/src/main/python/core/architecture/managers/pause_manager.py b/core/amber/src/main/python/core/architecture/managers/pause_manager.py index 4ddf2b7a85d..7ecc2631a18 100644 --- a/core/amber/src/main/python/core/architecture/managers/pause_manager.py +++ b/core/amber/src/main/python/core/architecture/managers/pause_manager.py @@ -7,7 +7,7 @@ from . 
import state_manager from proto.edu.uci.ics.amber.engine.architecture.worker import WorkerState -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity from ...models import InternalQueue diff --git a/core/amber/src/main/python/core/architecture/managers/statistics_manager.py b/core/amber/src/main/python/core/architecture/managers/statistics_manager.py index f615f66eb53..0dead5a347a 100644 --- a/core/amber/src/main/python/core/architecture/managers/statistics_manager.py +++ b/core/amber/src/main/python/core/architecture/managers/statistics_manager.py @@ -1,7 +1,7 @@ from typing import Dict from collections import defaultdict -from proto.edu.uci.ics.amber.engine.common import PortIdentity +from proto.edu.uci.ics.amber.core import PortIdentity from proto.edu.uci.ics.amber.engine.architecture.worker import ( WorkerStatistics, PortTupleCountMapping, diff --git a/core/amber/src/main/python/core/architecture/managers/tuple_processing_manager.py b/core/amber/src/main/python/core/architecture/managers/tuple_processing_manager.py index c217d5fe372..0b4ddb6871b 100644 --- a/core/amber/src/main/python/core/architecture/managers/tuple_processing_manager.py +++ b/core/amber/src/main/python/core/architecture/managers/tuple_processing_manager.py @@ -1,7 +1,7 @@ from threading import Event, Condition from typing import Optional, Tuple, Iterator -from proto.edu.uci.ics.amber.engine.common import PortIdentity +from proto.edu.uci.ics.amber.core import PortIdentity class TupleProcessingManager: diff --git a/core/amber/src/main/python/core/architecture/packaging/input_manager.py b/core/amber/src/main/python/core/architecture/packaging/input_manager.py index 4a50fccea81..1c52e797aac 100644 --- a/core/amber/src/main/python/core/architecture/packaging/input_manager.py +++ b/core/amber/src/main/python/core/architecture/packaging/input_manager.py @@ -11,7 +11,7 @@ ) from core.models.marker import EndOfInputChannel, State, StartOfInputChannel, Marker from core.models.payload import DataFrame, DataPayload, MarkerFrame -from proto.edu.uci.ics.amber.engine.common import ( +from proto.edu.uci.ics.amber.core import ( ActorVirtualIdentity, PortIdentity, ChannelIdentity, diff --git a/core/amber/src/main/python/core/architecture/packaging/output_manager.py b/core/amber/src/main/python/core/architecture/packaging/output_manager.py index e7592e0ab45..bdeac6bc367 100644 --- a/core/amber/src/main/python/core/architecture/packaging/output_manager.py +++ b/core/amber/src/main/python/core/architecture/packaging/output_manager.py @@ -32,7 +32,7 @@ RangeBasedShufflePartitioning, BroadcastPartitioning, ) -from proto.edu.uci.ics.amber.engine.common import ( +from proto.edu.uci.ics.amber.core import ( ActorVirtualIdentity, PhysicalLink, PortIdentity, diff --git a/core/amber/src/main/python/core/architecture/rpc/async_rpc_client.py b/core/amber/src/main/python/core/architecture/rpc/async_rpc_client.py index 6bd7c8a9cfb..91e4e4186e6 100644 --- a/core/amber/src/main/python/core/architecture/rpc/async_rpc_client.py +++ b/core/amber/src/main/python/core/architecture/rpc/async_rpc_client.py @@ -18,10 +18,8 @@ WorkerServiceStub, ControlRequest, ) -from proto.edu.uci.ics.amber.engine.common import ( - ActorVirtualIdentity, - ControlPayloadV2, -) +from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity R = TypeVar("R") diff --git 
a/core/amber/src/main/python/core/architecture/rpc/async_rpc_server.py b/core/amber/src/main/python/core/architecture/rpc/async_rpc_server.py index b214a0d5f2d..727c7dd6ac9 100644 --- a/core/amber/src/main/python/core/architecture/rpc/async_rpc_server.py +++ b/core/amber/src/main/python/core/architecture/rpc/async_rpc_server.py @@ -15,10 +15,8 @@ ControlError, ErrorLanguage, ) -from proto.edu.uci.ics.amber.engine.common import ( - ActorVirtualIdentity, - ControlPayloadV2, -) +from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class AsyncRPCServer: diff --git a/core/amber/src/main/python/core/architecture/sendsemantics/broad_cast_partitioner.py b/core/amber/src/main/python/core/architecture/sendsemantics/broad_cast_partitioner.py index 407172975f0..c5ca0fad369 100644 --- a/core/amber/src/main/python/core/architecture/sendsemantics/broad_cast_partitioner.py +++ b/core/amber/src/main/python/core/architecture/sendsemantics/broad_cast_partitioner.py @@ -11,7 +11,7 @@ Partitioning, BroadcastPartitioning, ) -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class BroadcastPartitioner(Partitioner): diff --git a/core/amber/src/main/python/core/architecture/sendsemantics/hash_based_shuffle_partitioner.py b/core/amber/src/main/python/core/architecture/sendsemantics/hash_based_shuffle_partitioner.py index f4e0942768c..775bd94b028 100644 --- a/core/amber/src/main/python/core/architecture/sendsemantics/hash_based_shuffle_partitioner.py +++ b/core/amber/src/main/python/core/architecture/sendsemantics/hash_based_shuffle_partitioner.py @@ -11,7 +11,7 @@ HashBasedShufflePartitioning, Partitioning, ) -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class HashBasedShufflePartitioner(Partitioner): diff --git a/core/amber/src/main/python/core/architecture/sendsemantics/one_to_one_partitioner.py b/core/amber/src/main/python/core/architecture/sendsemantics/one_to_one_partitioner.py index 1758363c0cb..81e623ab6a0 100644 --- a/core/amber/src/main/python/core/architecture/sendsemantics/one_to_one_partitioner.py +++ b/core/amber/src/main/python/core/architecture/sendsemantics/one_to_one_partitioner.py @@ -10,7 +10,7 @@ OneToOnePartitioning, Partitioning, ) -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class OneToOnePartitioner(Partitioner): diff --git a/core/amber/src/main/python/core/architecture/sendsemantics/partitioner.py b/core/amber/src/main/python/core/architecture/sendsemantics/partitioner.py index e2ff2df34c7..2220b350abc 100644 --- a/core/amber/src/main/python/core/architecture/sendsemantics/partitioner.py +++ b/core/amber/src/main/python/core/architecture/sendsemantics/partitioner.py @@ -8,7 +8,7 @@ from core.models.marker import Marker from core.util import get_one_of from proto.edu.uci.ics.amber.engine.architecture.sendsemantics import Partitioning -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class Partitioner(ABC): diff --git a/core/amber/src/main/python/core/architecture/sendsemantics/range_based_shuffle_partitioner.py b/core/amber/src/main/python/core/architecture/sendsemantics/range_based_shuffle_partitioner.py index 31d0ccc6f87..dee786c649a 100644 --- 
a/core/amber/src/main/python/core/architecture/sendsemantics/range_based_shuffle_partitioner.py +++ b/core/amber/src/main/python/core/architecture/sendsemantics/range_based_shuffle_partitioner.py @@ -12,7 +12,7 @@ RangeBasedShufflePartitioning, Partitioning, ) -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class RangeBasedShufflePartitioner(Partitioner): diff --git a/core/amber/src/main/python/core/architecture/sendsemantics/round_robin_partitioner.py b/core/amber/src/main/python/core/architecture/sendsemantics/round_robin_partitioner.py index 47011051f4e..4baa1463193 100644 --- a/core/amber/src/main/python/core/architecture/sendsemantics/round_robin_partitioner.py +++ b/core/amber/src/main/python/core/architecture/sendsemantics/round_robin_partitioner.py @@ -11,7 +11,7 @@ Partitioning, RoundRobinPartitioning, ) -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class RoundRobinPartitioner(Partitioner): diff --git a/core/amber/src/main/python/core/models/internal_marker.py b/core/amber/src/main/python/core/models/internal_marker.py index 78ed5c60513..1f21f731d79 100644 --- a/core/amber/src/main/python/core/models/internal_marker.py +++ b/core/amber/src/main/python/core/models/internal_marker.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from core.models.marker import Marker -from proto.edu.uci.ics.amber.engine.common import ChannelIdentity +from proto.edu.uci.ics.amber.core import ChannelIdentity @dataclass diff --git a/core/amber/src/main/python/core/models/internal_queue.py b/core/amber/src/main/python/core/models/internal_queue.py index 36e271983e8..ae22ba134e9 100644 --- a/core/amber/src/main/python/core/models/internal_queue.py +++ b/core/amber/src/main/python/core/models/internal_queue.py @@ -11,7 +11,8 @@ LinkedBlockingMultiQueue, ) from core.util.customized_queue.queue_base import IQueue, QueueElement -from proto.edu.uci.ics.amber.engine.common import ActorVirtualIdentity, ControlPayloadV2 +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity +from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 @dataclass diff --git a/core/amber/src/main/python/core/runnables/main_loop.py b/core/amber/src/main/python/core/runnables/main_loop.py index 845afe0d21a..0b66450162d 100644 --- a/core/amber/src/main/python/core/runnables/main_loop.py +++ b/core/amber/src/main/python/core/runnables/main_loop.py @@ -39,11 +39,8 @@ from proto.edu.uci.ics.amber.engine.architecture.worker import ( WorkerState, ) -from proto.edu.uci.ics.amber.engine.common import ( - ActorVirtualIdentity, - ControlPayloadV2, - PortIdentity, -) +from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity, PortIdentity class MainLoop(StoppableQueueBlockingRunnable): diff --git a/core/amber/src/main/python/core/runnables/network_sender.py b/core/amber/src/main/python/core/runnables/network_sender.py index 031f2783902..5ce2c7c7f95 100644 --- a/core/amber/src/main/python/core/runnables/network_sender.py +++ b/core/amber/src/main/python/core/runnables/network_sender.py @@ -9,11 +9,11 @@ from core.proxy import ProxyClient from core.util import StoppableQueueBlockingRunnable from proto.edu.uci.ics.amber.engine.common import ( - ActorVirtualIdentity, ControlPayloadV2, PythonControlMessage, PythonDataHeader, ) +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity 
class NetworkSender(StoppableQueueBlockingRunnable): diff --git a/core/amber/src/main/python/core/runnables/test_console_message.py b/core/amber/src/main/python/core/runnables/test_console_message.py index a643a789855..2ff4373f7d7 100644 --- a/core/amber/src/main/python/core/runnables/test_console_message.py +++ b/core/amber/src/main/python/core/runnables/test_console_message.py @@ -10,10 +10,10 @@ ConsoleMessageType, ) from proto.edu.uci.ics.amber.engine.common import ( - ActorVirtualIdentity, ControlPayloadV2, PythonControlMessage, ) +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class TestConsoleMessage: diff --git a/core/amber/src/main/python/core/runnables/test_main_loop.py b/core/amber/src/main/python/core/runnables/test_main_loop.py index 05cfdf9362b..77981fade9b 100644 --- a/core/amber/src/main/python/core/runnables/test_main_loop.py +++ b/core/amber/src/main/python/core/runnables/test_main_loop.py @@ -42,15 +42,15 @@ WorkerStatistics, PortTupleCountMapping, ) -from proto.edu.uci.ics.amber.engine.common import ( +from proto.edu.uci.ics.amber.core import ( ActorVirtualIdentity, - ControlPayloadV2, PhysicalLink, PhysicalOpIdentity, OperatorIdentity, ChannelIdentity, PortIdentity, ) +from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 from pytexera.udf.examples.count_batch_operator import CountBatchOperator from pytexera.udf.examples.echo_operator import EchoOperator from google.protobuf.any_pb2 import Any as ProtoAny diff --git a/core/amber/src/main/python/core/runnables/test_network_receiver.py b/core/amber/src/main/python/core/runnables/test_network_receiver.py index f2ca1d640c8..cfba03c7fee 100644 --- a/core/amber/src/main/python/core/runnables/test_network_receiver.py +++ b/core/amber/src/main/python/core/runnables/test_network_receiver.py @@ -11,10 +11,8 @@ from core.runnables.network_sender import NetworkSender from core.util.proto import set_one_of from proto.edu.uci.ics.amber.engine.architecture.rpc import ControlInvocation -from proto.edu.uci.ics.amber.engine.common import ( - ActorVirtualIdentity, - ControlPayloadV2, -) +from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 +from proto.edu.uci.ics.amber.core import ActorVirtualIdentity class TestNetworkReceiver: diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py b/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py new file mode 100644 index 00000000000..0cb9940da1d --- /dev/null +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py @@ -0,0 +1,108 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# sources: edu/uci/ics/amber/core/virtualidentity.proto, edu/uci/ics/amber/core/workflow.proto, edu/uci/ics/amber/core/workflowruntimestate.proto +# plugin: python-betterproto +# This file has been @generated + +from dataclasses import dataclass +from datetime import datetime +from typing import List + +import betterproto + + +class OutputPortOutputMode(betterproto.Enum): + SET_SNAPSHOT = 0 + """outputs complete result set snapshot for each update""" + + SET_DELTA = 1 + """outputs incremental result set delta for each update""" + + SINGLE_SNAPSHOT = 2 + """ + outputs a single snapshot for the entire execution, + used explicitly to support visualization operators that may exceed the memory limit + TODO: remove this mode after we have a better solution for output size limit + """ + + +class FatalErrorType(betterproto.Enum): + COMPILATION_ERROR = 0 + EXECUTION_FAILURE = 1 + + +@dataclass(eq=False, repr=False) +class WorkflowIdentity(betterproto.Message): + id: int = betterproto.int64_field(1) + + +@dataclass(eq=False, repr=False) +class ExecutionIdentity(betterproto.Message): + id: int = betterproto.int64_field(1) + + +@dataclass(eq=False, repr=False) +class ActorVirtualIdentity(betterproto.Message): + name: str = betterproto.string_field(1) + + +@dataclass(eq=False, repr=False) +class ChannelIdentity(betterproto.Message): + from_worker_id: "ActorVirtualIdentity" = betterproto.message_field(1) + to_worker_id: "ActorVirtualIdentity" = betterproto.message_field(2) + is_control: bool = betterproto.bool_field(3) + + +@dataclass(eq=False, repr=False) +class OperatorIdentity(betterproto.Message): + id: str = betterproto.string_field(1) + + +@dataclass(eq=False, repr=False) +class PhysicalOpIdentity(betterproto.Message): + logical_op_id: "OperatorIdentity" = betterproto.message_field(1) + layer_name: str = betterproto.string_field(2) + + +@dataclass(eq=False, repr=False) +class ChannelMarkerIdentity(betterproto.Message): + id: str = betterproto.string_field(1) + + +@dataclass(eq=False, repr=False) +class PortIdentity(betterproto.Message): + id: int = betterproto.int32_field(1) + internal: bool = betterproto.bool_field(2) + + +@dataclass(eq=False, repr=False) +class InputPort(betterproto.Message): + id: "PortIdentity" = betterproto.message_field(1) + display_name: str = betterproto.string_field(2) + allow_multi_links: bool = betterproto.bool_field(3) + dependencies: List["PortIdentity"] = betterproto.message_field(4) + + +@dataclass(eq=False, repr=False) +class OutputPort(betterproto.Message): + id: "PortIdentity" = betterproto.message_field(1) + display_name: str = betterproto.string_field(2) + blocking: bool = betterproto.bool_field(3) + mode: "OutputPortOutputMode" = betterproto.enum_field(4) + + +@dataclass(eq=False, repr=False) +class PhysicalLink(betterproto.Message): + from_op_id: "PhysicalOpIdentity" = betterproto.message_field(1) + from_port_id: "PortIdentity" = betterproto.message_field(2) + to_op_id: "PhysicalOpIdentity" = betterproto.message_field(3) + to_port_id: "PortIdentity" = betterproto.message_field(4) + + +@dataclass(eq=False, repr=False) +class WorkflowFatalError(betterproto.Message): + type: "FatalErrorType" = betterproto.enum_field(1) + timestamp: datetime = betterproto.message_field(2) + message: str = betterproto.string_field(3) + details: str = betterproto.string_field(4) + operator_id: str = betterproto.string_field(5) + worker_id: str = betterproto.string_field(6) diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py 
b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py index e2e066fff2e..676292d9605 100644 --- a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py @@ -17,7 +17,7 @@ import grpclib from betterproto.grpc.grpclib_server import ServiceBase -from ... import common as __common__ +from .... import core as ___core__ from .. import ( sendsemantics as _sendsemantics__, worker as _worker__, @@ -146,8 +146,8 @@ class EmptyRequest(betterproto.Message): @dataclass(eq=False, repr=False) class AsyncRpcContext(betterproto.Message): - sender: "__common__.ActorVirtualIdentity" = betterproto.message_field(1) - receiver: "__common__.ActorVirtualIdentity" = betterproto.message_field(2) + sender: "___core__.ActorVirtualIdentity" = betterproto.message_field(1) + receiver: "___core__.ActorVirtualIdentity" = betterproto.message_field(2) @dataclass(eq=False, repr=False) @@ -162,9 +162,9 @@ class ControlInvocation(betterproto.Message): class ChannelMarkerPayload(betterproto.Message): """Message for ChannelMarkerPayload""" - id: "__common__.ChannelMarkerIdentity" = betterproto.message_field(1) + id: "___core__.ChannelMarkerIdentity" = betterproto.message_field(1) marker_type: "ChannelMarkerType" = betterproto.enum_field(2) - scope: List["__common__.ChannelIdentity"] = betterproto.message_field(3) + scope: List["___core__.ChannelIdentity"] = betterproto.message_field(3) command_mapping: Dict[str, "ControlInvocation"] = betterproto.map_field( 4, betterproto.TYPE_STRING, betterproto.TYPE_MESSAGE ) @@ -172,13 +172,13 @@ class ChannelMarkerPayload(betterproto.Message): @dataclass(eq=False, repr=False) class PropagateChannelMarkerRequest(betterproto.Message): - source_op_to_start_prop: List["__common__.PhysicalOpIdentity"] = ( + source_op_to_start_prop: List["___core__.PhysicalOpIdentity"] = ( betterproto.message_field(1) ) - id: "__common__.ChannelMarkerIdentity" = betterproto.message_field(2) + id: "___core__.ChannelMarkerIdentity" = betterproto.message_field(2) marker_type: "ChannelMarkerType" = betterproto.enum_field(3) - scope: List["__common__.PhysicalOpIdentity"] = betterproto.message_field(4) - target_ops: List["__common__.PhysicalOpIdentity"] = betterproto.message_field(5) + scope: List["___core__.PhysicalOpIdentity"] = betterproto.message_field(4) + target_ops: List["___core__.PhysicalOpIdentity"] = betterproto.message_field(5) marker_command: "ControlRequest" = betterproto.message_field(6) marker_method_name: str = betterproto.string_field(7) @@ -186,7 +186,7 @@ class PropagateChannelMarkerRequest(betterproto.Message): @dataclass(eq=False, repr=False) class TakeGlobalCheckpointRequest(betterproto.Message): estimation_only: bool = betterproto.bool_field(1) - checkpoint_id: "__common__.ChannelMarkerIdentity" = betterproto.message_field(2) + checkpoint_id: "___core__.ChannelMarkerIdentity" = betterproto.message_field(2) destination: str = betterproto.string_field(3) @@ -215,7 +215,7 @@ class ModifyLogicRequest(betterproto.Message): @dataclass(eq=False, repr=False) class RetryWorkflowRequest(betterproto.Message): - workers: List["__common__.ActorVirtualIdentity"] = betterproto.message_field(1) + workers: List["___core__.ActorVirtualIdentity"] = betterproto.message_field(1) @dataclass(eq=False, repr=False) @@ -235,7 +235,7 @@ class ConsoleMessageTriggeredRequest(betterproto.Message): @dataclass(eq=False, repr=False) class 
PortCompletedRequest(betterproto.Message): - port_id: "__common__.PortIdentity" = betterproto.message_field(1) + port_id: "___core__.PortIdentity" = betterproto.message_field(1) input: bool = betterproto.bool_field(2) @@ -246,7 +246,7 @@ class WorkerStateUpdatedRequest(betterproto.Message): @dataclass(eq=False, repr=False) class LinkWorkersRequest(betterproto.Message): - link: "__common__.PhysicalLink" = betterproto.message_field(1) + link: "___core__.PhysicalLink" = betterproto.message_field(1) @dataclass(eq=False, repr=False) @@ -255,7 +255,7 @@ class Ping(betterproto.Message): i: int = betterproto.int32_field(1) end: int = betterproto.int32_field(2) - to: "__common__.ActorVirtualIdentity" = betterproto.message_field(3) + to: "___core__.ActorVirtualIdentity" = betterproto.message_field(3) @dataclass(eq=False, repr=False) @@ -264,7 +264,7 @@ class Pong(betterproto.Message): i: int = betterproto.int32_field(1) end: int = betterproto.int32_field(2) - to: "__common__.ActorVirtualIdentity" = betterproto.message_field(3) + to: "___core__.ActorVirtualIdentity" = betterproto.message_field(3) @dataclass(eq=False, repr=False) @@ -285,7 +285,7 @@ class Nested(betterproto.Message): class MultiCall(betterproto.Message): """MultiCall message""" - seq: List["__common__.ActorVirtualIdentity"] = betterproto.message_field(1) + seq: List["___core__.ActorVirtualIdentity"] = betterproto.message_field(1) @dataclass(eq=False, repr=False) @@ -299,7 +299,7 @@ class ErrorCommand(betterproto.Message): class Collect(betterproto.Message): """Collect message""" - workers: List["__common__.ActorVirtualIdentity"] = betterproto.message_field(1) + workers: List["___core__.ActorVirtualIdentity"] = betterproto.message_field(1) @dataclass(eq=False, repr=False) @@ -313,7 +313,7 @@ class GenerateNumber(betterproto.Message): class Chain(betterproto.Message): """Chain message""" - nexts: List["__common__.ActorVirtualIdentity"] = betterproto.message_field(1) + nexts: List["___core__.ActorVirtualIdentity"] = betterproto.message_field(1) @dataclass(eq=False, repr=False) @@ -327,19 +327,19 @@ class Recursion(betterproto.Message): class AddInputChannelRequest(betterproto.Message): """Messages for the commands""" - channel_id: "__common__.ChannelIdentity" = betterproto.message_field(1) - port_id: "__common__.PortIdentity" = betterproto.message_field(2) + channel_id: "___core__.ChannelIdentity" = betterproto.message_field(1) + port_id: "___core__.PortIdentity" = betterproto.message_field(2) @dataclass(eq=False, repr=False) class AddPartitioningRequest(betterproto.Message): - tag: "__common__.PhysicalLink" = betterproto.message_field(1) + tag: "___core__.PhysicalLink" = betterproto.message_field(1) partitioning: "_sendsemantics__.Partitioning" = betterproto.message_field(2) @dataclass(eq=False, repr=False) class AssignPortRequest(betterproto.Message): - port_id: "__common__.PortIdentity" = betterproto.message_field(1) + port_id: "___core__.PortIdentity" = betterproto.message_field(1) input: bool = betterproto.bool_field(2) schema: Dict[str, str] = betterproto.map_field( 3, betterproto.TYPE_STRING, betterproto.TYPE_STRING @@ -348,7 +348,7 @@ class AssignPortRequest(betterproto.Message): @dataclass(eq=False, repr=False) class FinalizeCheckpointRequest(betterproto.Message): - checkpoint_id: "__common__.ChannelMarkerIdentity" = betterproto.message_field(1) + checkpoint_id: "___core__.ChannelMarkerIdentity" = betterproto.message_field(1) write_to: str = betterproto.string_field(2) @@ -364,7 +364,7 @@ class 
InitializeExecutorRequest(betterproto.Message): @dataclass(eq=False, repr=False) class UpdateExecutorRequest(betterproto.Message): - target_op_id: "__common__.PhysicalOpIdentity" = betterproto.message_field(1) + target_op_id: "___core__.PhysicalOpIdentity" = betterproto.message_field(1) new_executor: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2) state_transfer_func: "betterproto_lib_google_protobuf.Any" = ( betterproto.message_field(3) @@ -373,13 +373,13 @@ class UpdateExecutorRequest(betterproto.Message): @dataclass(eq=False, repr=False) class PrepareCheckpointRequest(betterproto.Message): - checkpoint_id: "__common__.ChannelMarkerIdentity" = betterproto.message_field(1) + checkpoint_id: "___core__.ChannelMarkerIdentity" = betterproto.message_field(1) estimation_only: bool = betterproto.bool_field(2) @dataclass(eq=False, repr=False) class QueryStatisticsRequest(betterproto.Message): - filter_by_workers: List["__common__.ActorVirtualIdentity"] = ( + filter_by_workers: List["___core__.ActorVirtualIdentity"] = ( betterproto.message_field(1) ) @@ -1235,6 +1235,7 @@ async def retry_workflow( class RpcTesterBase(ServiceBase): + async def send_ping(self, ping: "Ping") -> "IntResponse": raise grpclib.GRPCError(grpclib.const.Status.UNIMPLEMENTED) @@ -1405,6 +1406,7 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: class WorkerServiceBase(ServiceBase): + async def add_input_channel( self, add_input_channel_request: "AddInputChannelRequest" ) -> "EmptyReturn": @@ -1711,6 +1713,7 @@ def __mapping__(self) -> Dict[str, grpclib.const.Handler]: class ControllerServiceBase(ServiceBase): + async def retrieve_workflow_state( self, empty_request: "EmptyRequest" ) -> "RetrieveWorkflowStateResponse": diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/sendsemantics/__init__.py b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/sendsemantics/__init__.py index b862b7ea3c1..b9769dc2bb9 100644 --- a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/sendsemantics/__init__.py +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/sendsemantics/__init__.py @@ -8,7 +8,7 @@ import betterproto -from ... import common as __common__ +from .... 
import core as ___core__ @dataclass(eq=False, repr=False) @@ -33,26 +33,26 @@ class Partitioning(betterproto.Message): @dataclass(eq=False, repr=False) class OneToOnePartitioning(betterproto.Message): batch_size: int = betterproto.int32_field(1) - channels: List["__common__.ChannelIdentity"] = betterproto.message_field(2) + channels: List["___core__.ChannelIdentity"] = betterproto.message_field(2) @dataclass(eq=False, repr=False) class RoundRobinPartitioning(betterproto.Message): batch_size: int = betterproto.int32_field(1) - channels: List["__common__.ChannelIdentity"] = betterproto.message_field(2) + channels: List["___core__.ChannelIdentity"] = betterproto.message_field(2) @dataclass(eq=False, repr=False) class HashBasedShufflePartitioning(betterproto.Message): batch_size: int = betterproto.int32_field(1) - channels: List["__common__.ChannelIdentity"] = betterproto.message_field(2) + channels: List["___core__.ChannelIdentity"] = betterproto.message_field(2) hash_attribute_names: List[str] = betterproto.string_field(3) @dataclass(eq=False, repr=False) class RangeBasedShufflePartitioning(betterproto.Message): batch_size: int = betterproto.int32_field(1) - channels: List["__common__.ChannelIdentity"] = betterproto.message_field(2) + channels: List["___core__.ChannelIdentity"] = betterproto.message_field(2) range_attribute_names: List[str] = betterproto.string_field(3) range_min: int = betterproto.int64_field(4) range_max: int = betterproto.int64_field(5) @@ -61,4 +61,4 @@ class RangeBasedShufflePartitioning(betterproto.Message): @dataclass(eq=False, repr=False) class BroadcastPartitioning(betterproto.Message): batch_size: int = betterproto.int32_field(1) - channels: List["__common__.ChannelIdentity"] = betterproto.message_field(2) + channels: List["___core__.ChannelIdentity"] = betterproto.message_field(2) diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/worker/__init__.py b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/worker/__init__.py index 4f7c35a6e96..344972b1060 100644 --- a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/worker/__init__.py +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/worker/__init__.py @@ -8,7 +8,7 @@ import betterproto -from ... import common as __common__ +from .... import core as ___core__ class WorkerState(betterproto.Enum): @@ -21,7 +21,7 @@ class WorkerState(betterproto.Enum): @dataclass(eq=False, repr=False) class PortTupleCountMapping(betterproto.Message): - port_id: "__common__.PortIdentity" = betterproto.message_field(1) + port_id: "___core__.PortIdentity" = betterproto.message_field(1) tuple_count: int = betterproto.int64_field(2) diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/common/__init__.py b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/common/__init__.py index 1c38e3cc6cf..7d1e19c8f8e 100644 --- a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/common/__init__.py +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/common/__init__.py @@ -1,10 +1,9 @@ # Generated by the protocol buffer compiler. DO NOT EDIT! 
-# sources: edu/uci/ics/amber/engine/common/actormessage.proto, edu/uci/ics/amber/engine/common/ambermessage.proto, edu/uci/ics/amber/engine/common/virtualidentity.proto, edu/uci/ics/amber/engine/common/workflow.proto, edu/uci/ics/amber/engine/common/workflowruntimestate.proto +# sources: edu/uci/ics/amber/engine/common/actormessage.proto, edu/uci/ics/amber/engine/common/ambermessage.proto, edu/uci/ics/amber/engine/common/executionruntimestate.proto # plugin: python-betterproto # This file has been @generated from dataclasses import dataclass -from datetime import datetime from typing import ( Dict, List, @@ -12,84 +11,13 @@ import betterproto +from ... import core as __core__ from ..architecture import ( rpc as _architecture_rpc__, worker as _architecture_worker__, ) -class FatalErrorType(betterproto.Enum): - COMPILATION_ERROR = 0 - EXECUTION_FAILURE = 1 - - -@dataclass(eq=False, repr=False) -class WorkflowIdentity(betterproto.Message): - id: int = betterproto.int64_field(1) - - -@dataclass(eq=False, repr=False) -class ExecutionIdentity(betterproto.Message): - id: int = betterproto.int64_field(1) - - -@dataclass(eq=False, repr=False) -class ActorVirtualIdentity(betterproto.Message): - name: str = betterproto.string_field(1) - - -@dataclass(eq=False, repr=False) -class ChannelIdentity(betterproto.Message): - from_worker_id: "ActorVirtualIdentity" = betterproto.message_field(1) - to_worker_id: "ActorVirtualIdentity" = betterproto.message_field(2) - is_control: bool = betterproto.bool_field(3) - - -@dataclass(eq=False, repr=False) -class OperatorIdentity(betterproto.Message): - id: str = betterproto.string_field(1) - - -@dataclass(eq=False, repr=False) -class PhysicalOpIdentity(betterproto.Message): - logical_op_id: "OperatorIdentity" = betterproto.message_field(1) - layer_name: str = betterproto.string_field(2) - - -@dataclass(eq=False, repr=False) -class ChannelMarkerIdentity(betterproto.Message): - id: str = betterproto.string_field(1) - - -@dataclass(eq=False, repr=False) -class PortIdentity(betterproto.Message): - id: int = betterproto.int32_field(1) - internal: bool = betterproto.bool_field(2) - - -@dataclass(eq=False, repr=False) -class InputPort(betterproto.Message): - id: "PortIdentity" = betterproto.message_field(1) - display_name: str = betterproto.string_field(2) - allow_multi_links: bool = betterproto.bool_field(3) - dependencies: List["PortIdentity"] = betterproto.message_field(4) - - -@dataclass(eq=False, repr=False) -class OutputPort(betterproto.Message): - id: "PortIdentity" = betterproto.message_field(1) - display_name: str = betterproto.string_field(2) - blocking: bool = betterproto.bool_field(3) - - -@dataclass(eq=False, repr=False) -class PhysicalLink(betterproto.Message): - from_op_id: "PhysicalOpIdentity" = betterproto.message_field(1) - from_port_id: "PortIdentity" = betterproto.message_field(2) - to_op_id: "PhysicalOpIdentity" = betterproto.message_field(3) - to_port_id: "PortIdentity" = betterproto.message_field(4) - - @dataclass(eq=False, repr=False) class ControlPayloadV2(betterproto.Message): control_invocation: "_architecture_rpc__.ControlInvocation" = ( @@ -102,13 +30,13 @@ class ControlPayloadV2(betterproto.Message): @dataclass(eq=False, repr=False) class PythonDataHeader(betterproto.Message): - tag: "ActorVirtualIdentity" = betterproto.message_field(1) + tag: "__core__.ActorVirtualIdentity" = betterproto.message_field(1) payload_type: str = betterproto.string_field(2) @dataclass(eq=False, repr=False) class PythonControlMessage(betterproto.Message): - tag: 
"ActorVirtualIdentity" = betterproto.message_field(1) + tag: "__core__.ActorVirtualIdentity" = betterproto.message_field(1) payload: "ControlPayloadV2" = betterproto.message_field(2) @@ -199,21 +127,11 @@ class ExecutionStatsStore(betterproto.Message): ) -@dataclass(eq=False, repr=False) -class WorkflowFatalError(betterproto.Message): - type: "FatalErrorType" = betterproto.enum_field(1) - timestamp: datetime = betterproto.message_field(2) - message: str = betterproto.string_field(3) - details: str = betterproto.string_field(4) - operator_id: str = betterproto.string_field(5) - worker_id: str = betterproto.string_field(6) - - @dataclass(eq=False, repr=False) class ExecutionMetadataStore(betterproto.Message): state: "_architecture_rpc__.WorkflowAggregatedState" = betterproto.enum_field(1) - fatal_errors: List["WorkflowFatalError"] = betterproto.message_field(2) - execution_id: "ExecutionIdentity" = betterproto.message_field(3) + fatal_errors: List["__core__.WorkflowFatalError"] = betterproto.message_field(2) + execution_id: "__core__.ExecutionIdentity" = betterproto.message_field(3) is_recovering: bool = betterproto.bool_field(4) diff --git a/core/amber/src/main/python/proto/scalapb/__init__.py b/core/amber/src/main/python/proto/scalapb/__init__.py index 51a1655804e..49c713815a5 100644 --- a/core/amber/src/main/python/proto/scalapb/__init__.py +++ b/core/amber/src/main/python/proto/scalapb/__init__.py @@ -21,8 +21,7 @@ class MatchType(betterproto.Enum): class ScalaPbOptionsOptionsScope(betterproto.Enum): """ - Whether to apply the options only to this file, or for the entire package - (and its subpackages) + Whether to apply the options only to this file, or for the entire package (and its subpackages) """ FILE = 0 @@ -46,63 +45,63 @@ class ScalaPbOptions(betterproto.Message): flat_package: bool = betterproto.bool_field(2) """ - If true, the compiler does not append the proto base file name into the - generated package name. If false (the default), the generated scala package - name is the package_name.basename where basename is the proto file name - without the .proto extension. + If true, the compiler does not append the proto base file name + into the generated package name. If false (the default), the + generated scala package name is the package_name.basename where + basename is the proto file name without the .proto extension. """ import_: List[str] = betterproto.string_field(3) """ - Adds the following imports at the top of the file (this is meant to provide - implicit TypeMappers) + Adds the following imports at the top of the file (this is meant + to provide implicit TypeMappers) """ preamble: List[str] = betterproto.string_field(4) """ - Text to add to the generated scala file. This can be used only when - single_file is true. + Text to add to the generated scala file. This can be used only + when single_file is true. """ single_file: bool = betterproto.bool_field(5) """ - If true, all messages and enums (but not services) will be written to a - single Scala file. + If true, all messages and enums (but not services) will be written + to a single Scala file. """ no_primitive_wrappers: bool = betterproto.bool_field(7) """ - By default, wrappers defined at https://github.com/google/protobuf/blob/mas - ter/src/google/protobuf/wrappers.proto, are mapped to an Option[T] where T - is a primitive type. When this field is set to true, we do not perform this - transformation. 
+ By default, wrappers defined at + https://github.com/google/protobuf/blob/master/src/google/protobuf/wrappers.proto, + are mapped to an Option[T] where T is a primitive type. When this field + is set to true, we do not perform this transformation. """ primitive_wrappers: bool = betterproto.bool_field(6) """ DEPRECATED. In ScalaPB <= 0.5.47, it was necessary to explicitly enable - primitive_wrappers. This field remains here for backwards compatibility, - but it has no effect on generated code. It is an error to set both - `primitive_wrappers` and `no_primitive_wrappers`. + primitive_wrappers. This field remains here for backwards compatibility, + but it has no effect on generated code. It is an error to set both + `primitive_wrappers` and `no_primitive_wrappers`. """ collection_type: str = betterproto.string_field(8) """ Scala type to be used for repeated fields. If unspecified, - `scala.collection.Seq` will be used. + `scala.collection.Seq` will be used. """ preserve_unknown_fields: bool = betterproto.bool_field(9) """ If set to true, all generated messages in this file will preserve unknown - fields. + fields. """ object_name: str = betterproto.string_field(10) """ - If defined, sets the name of the file-level object that would be generated. - This object extends `GeneratedFileObject` and contains descriptors, and - list of message and enum companions. + If defined, sets the name of the file-level object that would be generated. This + object extends `GeneratedFileObject` and contains descriptors, and list of message + and enum companions. """ scope: "ScalaPbOptionsOptionsScope" = betterproto.enum_field(11) @@ -114,15 +113,15 @@ class ScalaPbOptions(betterproto.Message): retain_source_code_info: bool = betterproto.bool_field(13) """ If true, then source-code info information will be included in the - generated code - normally the source code info is cleared out to reduce - code size. The source code info is useful for extracting source code - location from the descriptors as well as comments. + generated code - normally the source code info is cleared out to reduce + code size. The source code info is useful for extracting source code + location from the descriptors as well as comments. """ map_type: str = betterproto.string_field(14) """ Scala type to be used for maps. If unspecified, - `scala.collection.immutable.Map` will be used. + `scala.collection.immutable.Map` will be used. """ no_default_values_in_constructor: bool = betterproto.bool_field(15) @@ -133,8 +132,8 @@ class ScalaPbOptions(betterproto.Message): enum_value_naming: "ScalaPbOptionsEnumValueNaming" = betterproto.enum_field(16) enum_strip_prefix: bool = betterproto.bool_field(17) """ - Indicate if prefix (enum name + optional underscore) should be removed in - scala code Strip is applied before enum value naming changes. + Indicate if prefix (enum name + optional underscore) should be removed in scala code + Strip is applied before enum value naming changes. """ bytes_type: str = betterproto.string_field(21) @@ -169,9 +168,8 @@ class ScalaPbOptions(betterproto.Message): field_transformations: List["FieldTransformation"] = betterproto.message_field(25) ignore_all_transformations: bool = betterproto.bool_field(26) """ - Ignores all transformations for this file. This is meant to allow specific - files to opt out from transformations inherited through package-scoped - options. + Ignores all transformations for this file. 
This is meant to allow specific files to + opt out from transformations inherited through package-scoped options. """ getters: bool = betterproto.bool_field(27) @@ -179,17 +177,17 @@ class ScalaPbOptions(betterproto.Message): test_only_no_java_conversions: bool = betterproto.bool_field(999) """ - For use in tests only. Inhibit Java conversions even when when generator - parameters request for it. + For use in tests only. Inhibit Java conversions even when when generator parameters + request for it. """ @dataclass(eq=False, repr=False) class ScalaPbOptionsAuxMessageOptions(betterproto.Message): """ - AuxMessageOptions enables you to set message-level options through package- - scoped options. This is useful when you can't add a dependency on - scalapb.proto from the proto file that defines the message. + AuxMessageOptions enables you to set message-level options through package-scoped options. + This is useful when you can't add a dependency on scalapb.proto from the proto file that + defines the message. """ target: str = betterproto.string_field(1) @@ -197,17 +195,17 @@ class ScalaPbOptionsAuxMessageOptions(betterproto.Message): options: "MessageOptions" = betterproto.message_field(2) """ - Options to apply to the message. If there are any options defined on the - target message they take precedence over the options. + Options to apply to the message. If there are any options defined on the target message + they take precedence over the options. """ @dataclass(eq=False, repr=False) class ScalaPbOptionsAuxFieldOptions(betterproto.Message): """ - AuxFieldOptions enables you to set field-level options through package- - scoped options. This is useful when you can't add a dependency on - scalapb.proto from the proto file that defines the field. + AuxFieldOptions enables you to set field-level options through package-scoped options. + This is useful when you can't add a dependency on scalapb.proto from the proto file that + defines the field. """ target: str = betterproto.string_field(1) @@ -215,17 +213,17 @@ class ScalaPbOptionsAuxFieldOptions(betterproto.Message): options: "FieldOptions" = betterproto.message_field(2) """ - Options to apply to the field. If there are any options defined on the - target message they take precedence over the options. + Options to apply to the field. If there are any options defined on the target message + they take precedence over the options. """ @dataclass(eq=False, repr=False) class ScalaPbOptionsAuxEnumOptions(betterproto.Message): """ - AuxEnumOptions enables you to set enum-level options through package-scoped - options. This is useful when you can't add a dependency on scalapb.proto - from the proto file that defines the enum. + AuxEnumOptions enables you to set enum-level options through package-scoped options. + This is useful when you can't add a dependency on scalapb.proto from the proto file that + defines the enum. """ target: str = betterproto.string_field(1) @@ -233,17 +231,17 @@ class ScalaPbOptionsAuxEnumOptions(betterproto.Message): options: "EnumOptions" = betterproto.message_field(2) """ - Options to apply to the enum. If there are any options defined on the - target enum they take precedence over the options. + Options to apply to the enum. If there are any options defined on the target enum + they take precedence over the options. """ @dataclass(eq=False, repr=False) class ScalaPbOptionsAuxEnumValueOptions(betterproto.Message): """ - AuxEnumValueOptions enables you to set enum value level options through - package-scoped options. 
This is useful when you can't add a dependency on - scalapb.proto from the proto file that defines the enum. + AuxEnumValueOptions enables you to set enum value level options through package-scoped + options. This is useful when you can't add a dependency on scalapb.proto from the proto + file that defines the enum. """ target: str = betterproto.string_field(1) @@ -251,8 +249,8 @@ class ScalaPbOptionsAuxEnumValueOptions(betterproto.Message): options: "EnumValueOptions" = betterproto.message_field(2) """ - Options to apply to the enum value. If there are any options defined on the - target enum value they take precedence over the options. + Options to apply to the enum value. If there are any options defined on + the target enum value they take precedence over the options. """ @@ -269,8 +267,8 @@ class MessageOptions(betterproto.Message): type: str = betterproto.string_field(4) """ - All instances of this message will be converted to this type. An implicit - TypeMapper must be present. + All instances of this message will be converted to this type. An implicit TypeMapper + must be present. """ companion_annotations: List[str] = betterproto.string_field(5) @@ -280,30 +278,26 @@ class MessageOptions(betterproto.Message): sealed_oneof_extends: List[str] = betterproto.string_field(6) """ - Additional classes and traits to mix in to generated sealed_oneof base - trait. + Additional classes and traits to mix in to generated sealed_oneof base trait. """ no_box: bool = betterproto.bool_field(7) """ - If true, when this message is used as an optional field, do not wrap it in - an `Option`. This is equivalent of setting `(field).no_box` to true on each - field with the message type. + If true, when this message is used as an optional field, do not wrap it in an `Option`. + This is equivalent of setting `(field).no_box` to true on each field with the message type. """ unknown_fields_annotations: List[str] = betterproto.string_field(8) """ - Custom annotations to add to the generated `unknownFields` case class - field. + Custom annotations to add to the generated `unknownFields` case class field. """ @dataclass(eq=False, repr=False) class Collection(betterproto.Message): """ - Represents a custom Collection type in Scala. This allows ScalaPB to - integrate with collection types that are different enough from the ones in - the standard library. + Represents a custom Collection type in Scala. This allows ScalaPB to integrate with + collection types that are different enough from the ones in the standard library. """ type: str = betterproto.string_field(1) @@ -312,14 +306,14 @@ class Collection(betterproto.Message): non_empty: bool = betterproto.bool_field(2) """ Set to true if this collection type is not allowed to be empty, for example - cats.data.NonEmptyList. When true, ScalaPB will not generate `clearX` for - the repeated field and not provide a default argument in the constructor. + cats.data.NonEmptyList. When true, ScalaPB will not generate `clearX` for the repeated + field and not provide a default argument in the constructor. """ adapter: str = betterproto.string_field(3) """ - An Adapter is a Scala object available at runtime that provides certain - static methods that can operate on this collection type. + An Adapter is a Scala object available at runtime that provides certain static methods + that can operate on this collection type. 
""" @@ -329,16 +323,16 @@ class FieldOptions(betterproto.Message): scala_name: str = betterproto.string_field(2) collection_type: str = betterproto.string_field(3) """ - Can be specified only if this field is repeated. If unspecified, it falls - back to the file option named `collection_type`, which defaults to - `scala.collection.Seq`. + Can be specified only if this field is repeated. If unspecified, + it falls back to the file option named `collection_type`, which defaults + to `scala.collection.Seq`. """ collection: "Collection" = betterproto.message_field(8) key_type: str = betterproto.string_field(4) """ - If the field is a map, you can specify custom Scala types for the key or - value. + If the field is a map, you can specify custom Scala types for the key + or value. """ value_type: str = betterproto.string_field(5) @@ -347,22 +341,20 @@ class FieldOptions(betterproto.Message): map_type: str = betterproto.string_field(7) """ - Can be specified only if this field is a map. If unspecified, it falls back - to the file option named `map_type` which defaults to - `scala.collection.immutable.Map` + Can be specified only if this field is a map. If unspecified, + it falls back to the file option named `map_type` which defaults to + `scala.collection.immutable.Map` """ no_box: bool = betterproto.bool_field(30) """ - Do not box this value in Option[T]. If set, this overrides - MessageOptions.no_box + Do not box this value in Option[T]. If set, this overrides MessageOptions.no_box """ required: bool = betterproto.bool_field(31) """ - Like no_box it does not box a value in Option[T], but also fails parsing - when a value is not provided. This enables to emulate required fields in - proto3. + Like no_box it does not box a value in Option[T], but also fails parsing when a value + is not provided. This enables to emulate required fields in proto3. """ @@ -376,8 +368,8 @@ class EnumOptions(betterproto.Message): type: str = betterproto.string_field(3) """ - All instances of this enum will be converted to this type. An implicit - TypeMapper must be present. + All instances of this enum will be converted to this type. An implicit TypeMapper + must be present. """ base_annotations: List[str] = betterproto.string_field(4) diff --git a/core/scripts/python-proto-gen.sh b/core/scripts/python-proto-gen.sh index 21db91b5b0d..135be499521 100755 --- a/core/scripts/python-proto-gen.sh +++ b/core/scripts/python-proto-gen.sh @@ -4,7 +4,14 @@ TEXERA_ROOT="$(git rev-parse --show-toplevel)" AMBER_DIR="$TEXERA_ROOT/core/amber" PYAMBER_DIR="$AMBER_DIR/src/main/python" -PROTOBUF_DIR="$AMBER_DIR/src/main/protobuf" +PROTOBUF_AMBER_DIR="$AMBER_DIR/src/main/protobuf" + +CORE_DIR="$TEXERA_ROOT/core/workflow-core" +PROTOBUF_CORE_DIR="$CORE_DIR/src/main/protobuf" # proto-gen -protoc --python_betterproto_out="$PYAMBER_DIR/proto" -I="$PROTOBUF_DIR" $(find "$PROTOBUF_DIR" -iname "*.proto") --proto_path="$PROTOBUF_DIR" \ No newline at end of file +protoc --python_betterproto_out="$PYAMBER_DIR/proto" \ + -I="$PROTOBUF_AMBER_DIR" \ + -I="$PROTOBUF_CORE_DIR" \ + $(find "$PROTOBUF_AMBER_DIR" -iname "*.proto") \ + $(find "$PROTOBUF_CORE_DIR" -iname "*.proto") From 461789079f9d76037fbd8ed080dca3ada908a3ff Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Tue, 31 Dec 2024 01:22:32 -0800 Subject: [PATCH 03/10] Make OpExecInitInfo serializable (#3183) Previously, creating a physical operator during compilation also required the creation of its corresponding executor instances. 
 .../architecture/rpc/controlcommands.proto    |  4 +-
 .../control/initialize_executor_handler.py    |  6 +-
 .../architecture/managers/executor_manager.py |  1 -
 .../python/core/runnables/test_main_loop.py   | 35 ++++----
 .../proto/edu/uci/ics/amber/core/__init__.py  | 39 ++++++++-
 .../amber/engine/architecture/rpc/__init__.py |  5 +-
 core/amber/src/main/resources/cluster.conf    |  1 -
 .../pythonworker/PythonProxyClient.scala      | 28 +-----
 .../RegionExecutionCoordinator.scala          | 12 +--
 .../managers/SerializationManager.scala       | 31 +++----
 .../InitializeExecutorHandler.scala           | 29 ++++---
 .../uci/ics/texera/workflow/LogicalPlan.scala |  2 +-
 .../texera/workflow/WorkflowCompiler.scala    |  2 +-
 .../architecture/worker/WorkerSpec.scala      | 35 ++++----
 .../amber/compiler/model/LogicalPlan.scala    |  7 +-
 .../edu/uci/ics/amber/core/executor.proto     | 44 ++++++++++
 .../ics/amber/core/executor/ExecFactory.scala | 44 ++++++++++
 .../amber/core/executor/OpExecInitInfo.scala  | 54 ------------
 .../ics/amber/core/storage/FileResolver.scala | 18 ++++
 .../ics/amber/core/workflow/PhysicalOp.scala  | 42 +--------
 .../operator/PythonOperatorDescriptor.scala   | 10 +--
 .../operator/SpecialPhysicalOpFactory.scala   | 19 ++--
 .../ics/amber/operator/TestOperators.scala    |  4 +-
 .../operator/aggregate/AggregateOpDesc.scala  | 72 ++++++++-------
 .../operator/aggregate/AggregateOpExec.scala  | 18 ++--
 .../CartesianProductOpDesc.scala              |  9 +-
 .../dictionary/DictionaryMatcherOpDesc.scala  |  8 +-
 .../dictionary/DictionaryMatcherOpExec.scala  | 19 ++--
 .../difference/DifferenceOpDesc.scala         |  4 +-
 .../operator/distinct/DistinctOpDesc.scala    |  4 +-
 .../filter/SpecializedFilterOpDesc.scala      |  8 +-
 .../filter/SpecializedFilterOpExec.scala      |  8 +-
 .../hashJoin/HashJoinBuildOpExec.scala        |  8 +-
 .../operator/hashJoin/HashJoinOpDesc.scala    | 16 ++--
 .../hashJoin/HashJoinProbeOpExec.scala        | 21 +++--
 .../operator/intersect/IntersectOpDesc.scala  |  4 +-
 .../intervalJoin/IntervalJoinOpDesc.scala     | 15 ++--
 .../intervalJoin/IntervalJoinOpExec.scala     | 63 +++++++-------
 .../keywordSearch/KeywordSearchOpDesc.scala   |  8 +-
 .../keywordSearch/KeywordSearchOpExec.scala   | 12 ++-
 .../amber/operator/limit/LimitOpDesc.scala    | 10 ++-
 .../amber/operator/limit/LimitOpExec.scala    |  6 +-
 .../projection/ProjectionOpDesc.scala         |  8 +-
 .../projection/ProjectionOpExec.scala         | 13 +--
 .../RandomKSamplingOpDesc.scala               | 15 ++--
 .../RandomKSamplingOpExec.scala               | 11 ++-
 .../amber/operator/regex/RegexOpDesc.scala    |  8 +-
 .../amber/operator/regex/RegexOpExec.scala    |  9 +-
 .../ReservoirSamplingOpDesc.scala             | 15 ++--
 .../ReservoirSamplingOpExec.scala             | 15 ++--
 .../sentiment/SentimentAnalysisOpDesc.scala   |  8 +-
 .../sentiment/SentimentAnalysisOpExec.java    |  7 +-
 .../{managed => }/ProgressiveSinkOpExec.scala |  8 +-
 .../sortPartitions/SortPartitionsOpDesc.scala | 15 ++--
 ...pExec.scala => SortPartitionsOpExec.scala} | 16 ++--
 .../source/SourceOperatorDescriptor.scala     |  5 +-
 .../apis/twitter/TwitterSourceOpExec.scala    | 13 +--
 ...TwitterFullArchiveSearchSourceOpDesc.scala | 17 ++--
 ...TwitterFullArchiveSearchSourceOpExec.scala | 27 +++---
 .../v2/TwitterSearchSourceOpDesc.scala        | 15 ++--
 .../v2/TwitterSearchSourceOpExec.scala        | 20 ++---
 .../source/fetcher/URLFetcherOpDesc.scala     | 10 ++-
 .../source/fetcher/URLFetcherOpExec.scala     | 11 +--
 .../source/scan/FileScanSourceOpDesc.scala    | 22 ++---
 .../source/scan/FileScanSourceOpExec.scala    | 35 ++++----
 .../source/scan/ScanSourceOpDesc.scala        | 18 ++--
 .../source/scan/arrow/ArrowSourceOpDesc.scala | 20 ++---
 .../source/scan/arrow/ArrowSourceOpExec.scala | 24 +++--
 .../source/scan/csv/CSVScanSourceOpDesc.scala | 40 +++------
 .../source/scan/csv/CSVScanSourceOpExec.scala | 32 +++----
 .../csv/ParallelCSVScanSourceOpDesc.scala     | 45 +++------
 .../csv/ParallelCSVScanSourceOpExec.scala     | 35 +++++---
 .../scan/csvOld/CSVOldScanSourceOpDesc.scala  | 39 +++------
 .../scan/csvOld/CSVOldScanSourceOpExec.scala  | 38 ++++----
 .../scan/json/JSONLScanSourceOpDesc.scala     | 48 +++-------
 .../scan/json/JSONLScanSourceOpExec.scala     | 39 +++++----
 .../scan/text/TextInputSourceOpDesc.scala     |  8 +-
 .../scan/text/TextInputSourceOpExec.scala     | 23 +++--
 .../operator/source/sql/SQLSourceOpDesc.scala | 24 ++---
 .../operator/source/sql/SQLSourceOpExec.scala | 78 ++++++++---------
 .../sql/asterixdb/AsterixDBSourceOpDesc.scala | 51 +++-------
 .../sql/asterixdb/AsterixDBSourceOpExec.scala | 87 +++++++------------
 .../source/sql/mysql/MySQLSourceOpDesc.scala  | 26 ++----
 .../source/sql/mysql/MySQLSourceOpExec.scala  | 50 +++--------
 .../postgresql/PostgreSQLSourceOpDesc.scala   | 26 ++----
 .../postgresql/PostgreSQLSourceOpExec.scala   | 48 +++------
 .../amber/operator/split/SplitOpDesc.scala    |  8 +-
 .../amber/operator/split/SplitOpExec.scala    | 10 +--
 .../SymmetricDifferenceOpDesc.scala           |  6 +-
 .../typecasting/TypeCastingOpDesc.scala       |  8 +-
 .../typecasting/TypeCastingOpExec.scala       |  9 +-
 .../operator/udf/java/JavaUDFOpDesc.scala     |  6 +-
 .../DualInputPortsPythonUDFOpDescV2.scala     |  6 +-
 .../udf/python/PythonUDFOpDescV2.scala        |  6 +-
 .../source/PythonUDFSourceOpDescV2.scala      |  5 +-
 .../ics/amber/operator/udf/r/RUDFOpDesc.scala |  6 +-
 .../operator/udf/r/RUDFSourceOpDesc.scala     | 10 ++-
 .../amber/operator/union/UnionOpDesc.scala    |  4 +-
 .../unneststring/UnnestStringOpDesc.scala     |  8 +-
 .../unneststring/UnnestStringOpExec.scala     | 10 ++-
 .../visualization/htmlviz/HtmlVizOpDesc.scala |  8 +-
 .../visualization/htmlviz/HtmlVizOpExec.scala |  6 +-
 .../visualization/urlviz/UrlVizOpDesc.scala   | 10 ++-
 .../visualization/urlviz/UrlVizOpExec.scala   |  7 +-
 .../DictionaryMatcherOpExecSpec.scala         | 37 ++++----
 .../filter/SpecializedFilterOpExecSpec.scala  | 50 ++++------
 .../operator/hashJoin/HashJoinOpSpec.scala    | 27 +++---
 .../intervalJoin/IntervalOpExecSpec.scala     | 26 ++----
 .../KeywordSearchOpExecSpec.scala             | 62 +++++++++----
 .../projection/ProjectionOpExecSpec.scala     | 73 +++++++---------
 .../SortPartitionsOpExecSpec.scala            | 13 ++-
 .../source/fetcher/URLFetcherOpExecSpec.scala | 11 ++-
 .../scan/csv/CSVScanSourceOpDescSpec.scala    | 22 ++---
 .../scan/text/FileScanSourceOpDescSpec.scala  | 74 +++++-----------
 .../scan/text/TextInputSourceOpDescSpec.scala | 19 +++-
 .../typecasting/TypeCastingOpExecSpec.scala   |  8 +-
 .../unneststring/UnnestStringOpExecSpec.scala | 18 ++--
 .../htmlviz/HtmlVizOpExecSpec.scala           | 12 +--
 118 files changed, 1161 insertions(+), 1331 deletions(-)
 create mode 100644 core/workflow-core/src/main/protobuf/edu/uci/ics/amber/core/executor.proto
 create mode 100644 core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/ExecFactory.scala
 delete mode 100644 core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/OpExecInitInfo.scala
 rename core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/{managed => }/ProgressiveSinkOpExec.scala (89%)
 rename core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/{SortPartitionOpExec.scala => SortPartitionsOpExec.scala} (77%)

diff --git a/core/amber/src/main/protobuf/edu/uci/ics/amber/engine/architecture/rpc/controlcommands.proto b/core/amber/src/main/protobuf/edu/uci/ics/amber/engine/architecture/rpc/controlcommands.proto
index 81f7f4b21ba..5df9e7ab47b 100644
--- a/core/amber/src/main/protobuf/edu/uci/ics/amber/engine/architecture/rpc/controlcommands.proto
+++ b/core/amber/src/main/protobuf/edu/uci/ics/amber/engine/architecture/rpc/controlcommands.proto
@@ -3,6 +3,7 @@ package edu.uci.ics.amber.engine.architecture.rpc;
 
 import "edu/uci/ics/amber/core/virtualidentity.proto";
 import "edu/uci/ics/amber/core/workflow.proto";
+import "edu/uci/ics/amber/core/executor.proto";
 import "edu/uci/ics/amber/engine/architecture/worker/statistics.proto";
 import "edu/uci/ics/amber/engine/architecture/sendsemantics/partitionings.proto";
 import "scalapb/scalapb.proto";
@@ -235,9 +236,8 @@ message FinalizeCheckpointRequest {
 
 message InitializeExecutorRequest {
   int32 totalWorkerCount = 1;
-  google.protobuf.Any opExecInitInfo = 2 [(scalapb.field).no_box = true];
+  core.OpExecInitInfo opExecInitInfo = 2;
   bool isSource = 3;
-  string language = 4;
 }
 
 message UpdateExecutorRequest {
diff --git a/core/amber/src/main/python/core/architecture/handlers/control/initialize_executor_handler.py b/core/amber/src/main/python/core/architecture/handlers/control/initialize_executor_handler.py
index fbff3b09418..9ae770fca13 100644
--- a/core/amber/src/main/python/core/architecture/handlers/control/initialize_executor_handler.py
+++ b/core/amber/src/main/python/core/architecture/handlers/control/initialize_executor_handler.py
@@ -1,4 +1,6 @@
 from core.architecture.handlers.control.control_handler_base import ControlHandler
+from core.util import get_one_of
+from proto.edu.uci.ics.amber.core import OpExecWithCode
 from proto.edu.uci.ics.amber.engine.architecture.rpc import (
     EmptyReturn,
     InitializeExecutorRequest,
@@ -8,8 +10,8 @@
 
 class InitializeExecutorHandler(ControlHandler):
 
     async def initialize_executor(self, req: InitializeExecutorRequest) -> EmptyReturn:
-        code = req.op_exec_init_info.value.decode("utf-8")
+        op_exec_with_code: OpExecWithCode = get_one_of(req.op_exec_init_info)
         self.context.executor_manager.initialize_executor(
-            code, req.is_source, req.language
+            op_exec_with_code.code, req.is_source, op_exec_with_code.language
         )
return EmptyReturn() diff --git a/core/amber/src/main/python/core/architecture/managers/executor_manager.py b/core/amber/src/main/python/core/architecture/managers/executor_manager.py index 0ab6fd9c33e..238e6c3f9a1 100644 --- a/core/amber/src/main/python/core/architecture/managers/executor_manager.py +++ b/core/amber/src/main/python/core/architecture/managers/executor_manager.py @@ -114,7 +114,6 @@ def initialize_executor(self, code: str, is_source: bool, language: str) -> None class declaration. :param is_source: Indicating if the operator is used as a source operator. :param language: The language of the operator code. - :param output_schema: the raw mapping of output schema, name -> type_str. :return: """ if language == "r-tuple": diff --git a/core/amber/src/main/python/core/runnables/test_main_loop.py b/core/amber/src/main/python/core/runnables/test_main_loop.py index 77981fade9b..910149b06a1 100644 --- a/core/amber/src/main/python/core/runnables/test_main_loop.py +++ b/core/amber/src/main/python/core/runnables/test_main_loop.py @@ -1,10 +1,10 @@ import inspect +import pickle from threading import Thread import pandas import pyarrow import pytest -import pickle from core.models import ( DataFrame, @@ -16,6 +16,16 @@ from core.models.marker import EndOfInputChannel from core.runnables import MainLoop from core.util import set_one_of +from proto.edu.uci.ics.amber.core import ( + ActorVirtualIdentity, + PhysicalLink, + PhysicalOpIdentity, + OperatorIdentity, + ChannelIdentity, + PortIdentity, + OpExecWithCode, + OpExecInitInfo, +) from proto.edu.uci.ics.amber.engine.architecture.rpc import ( ControlRequest, AssignPortRequest, @@ -42,18 +52,9 @@ WorkerStatistics, PortTupleCountMapping, ) -from proto.edu.uci.ics.amber.core import ( - ActorVirtualIdentity, - PhysicalLink, - PhysicalOpIdentity, - OperatorIdentity, - ChannelIdentity, - PortIdentity, -) from proto.edu.uci.ics.amber.engine.common import ControlPayloadV2 from pytexera.udf.examples.count_batch_operator import CountBatchOperator from pytexera.udf.examples.echo_operator import EchoOperator -from google.protobuf.any_pb2 import Any as ProtoAny class TestMainLoop: @@ -270,13 +271,14 @@ def mock_initialize_executor( command_sequence, mock_raw_schema, ): - proto_any = ProtoAny() + operator_code = "from pytexera import *\n" + inspect.getsource(EchoOperator) - proto_any.value = operator_code.encode("utf-8") command = set_one_of( ControlRequest, InitializeExecutorRequest( - op_exec_init_info=proto_any, + op_exec_init_info=set_one_of( + OpExecInitInfo, OpExecWithCode(operator_code, "python") + ), is_source=False, ), ) @@ -299,15 +301,16 @@ def mock_initialize_batch_count_executor( command_sequence, mock_raw_schema, ): - proto_any = ProtoAny() + operator_code = "from pytexera import *\n" + inspect.getsource( CountBatchOperator ) - proto_any.value = operator_code.encode("utf-8") command = set_one_of( ControlRequest, InitializeExecutorRequest( - op_exec_init_info=proto_any, + op_exec_init_info=set_one_of( + OpExecInitInfo, OpExecWithCode(operator_code, "python") + ), is_source=False, ), ) diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py b/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py index 0cb9940da1d..31c15ddbc9b 100644 --- a/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/core/__init__.py @@ -1,5 +1,5 @@ # Generated by the protocol buffer compiler. DO NOT EDIT! 
-# sources: edu/uci/ics/amber/core/virtualidentity.proto, edu/uci/ics/amber/core/workflow.proto, edu/uci/ics/amber/core/workflowruntimestate.proto +# sources: edu/uci/ics/amber/core/executor.proto, edu/uci/ics/amber/core/virtualidentity.proto, edu/uci/ics/amber/core/workflow.proto, edu/uci/ics/amber/core/workflowruntimestate.proto # plugin: python-betterproto # This file has been @generated @@ -98,6 +98,43 @@ class PhysicalLink(betterproto.Message): to_port_id: "PortIdentity" = betterproto.message_field(4) +@dataclass(eq=False, repr=False) +class OpExecWithCode(betterproto.Message): + code: str = betterproto.string_field(1) + language: str = betterproto.string_field(2) + + +@dataclass(eq=False, repr=False) +class OpExecWithClassName(betterproto.Message): + class_name: str = betterproto.string_field(1) + desc_string: str = betterproto.string_field(2) + + +@dataclass(eq=False, repr=False) +class OpExecSink(betterproto.Message): + storage_key: str = betterproto.string_field(1) + workflow_identity: "WorkflowIdentity" = betterproto.message_field(2) + output_mode: "OutputPortOutputMode" = betterproto.enum_field(3) + + +@dataclass(eq=False, repr=False) +class OpExecSource(betterproto.Message): + storage_key: str = betterproto.string_field(1) + workflow_identity: "WorkflowIdentity" = betterproto.message_field(2) + + +@dataclass(eq=False, repr=False) +class OpExecInitInfo(betterproto.Message): + op_exec_with_class_name: "OpExecWithClassName" = betterproto.message_field( + 1, group="sealed_value" + ) + op_exec_with_code: "OpExecWithCode" = betterproto.message_field( + 2, group="sealed_value" + ) + op_exec_sink: "OpExecSink" = betterproto.message_field(3, group="sealed_value") + op_exec_source: "OpExecSource" = betterproto.message_field(4, group="sealed_value") + + @dataclass(eq=False, repr=False) class WorkflowFatalError(betterproto.Message): type: "FatalErrorType" = betterproto.enum_field(1) diff --git a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py index 676292d9605..9320b54e36e 100644 --- a/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py +++ b/core/amber/src/main/python/proto/edu/uci/ics/amber/engine/architecture/rpc/__init__.py @@ -355,11 +355,8 @@ class FinalizeCheckpointRequest(betterproto.Message): @dataclass(eq=False, repr=False) class InitializeExecutorRequest(betterproto.Message): total_worker_count: int = betterproto.int32_field(1) - op_exec_init_info: "betterproto_lib_google_protobuf.Any" = ( - betterproto.message_field(2) - ) + op_exec_init_info: "___core__.OpExecInitInfo" = betterproto.message_field(2) is_source: bool = betterproto.bool_field(3) - language: str = betterproto.string_field(4) @dataclass(eq=False, repr=False) diff --git a/core/amber/src/main/resources/cluster.conf b/core/amber/src/main/resources/cluster.conf index 67e0e847a97..f3ae050244c 100644 --- a/core/amber/src/main/resources/cluster.conf +++ b/core/amber/src/main/resources/cluster.conf @@ -27,7 +27,6 @@ akka { serialization-bindings { "java.io.Serializable" = kryo "java.lang.Throwable" = akka-misc - "edu.uci.ics.amber.core.executor.OpExecInitInfo" = kryo } } diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala index 05cb7b0758a..c7dc6400c1e 100644 --- 
a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala @@ -1,27 +1,21 @@ package edu.uci.ics.amber.engine.architecture.pythonworker -import com.google.protobuf.ByteString -import com.google.protobuf.any.Any import com.twitter.util.{Await, Promise} import edu.uci.ics.amber.core.WorkflowRuntimeException -import edu.uci.ics.amber.core.executor.{OpExecInitInfo, OpExecInitInfoWithCode} import edu.uci.ics.amber.core.marker.State import edu.uci.ics.amber.core.tuple.{Schema, Tuple} +import edu.uci.ics.amber.core.virtualidentity.ActorVirtualIdentity import edu.uci.ics.amber.engine.architecture.pythonworker.WorkerBatchInternalQueue.{ ActorCommandElement, ControlElement, DataElement } -import edu.uci.ics.amber.engine.architecture.rpc.controlcommands.{ - ControlInvocation, - InitializeExecutorRequest -} +import edu.uci.ics.amber.engine.architecture.rpc.controlcommands.ControlInvocation import edu.uci.ics.amber.engine.architecture.rpc.controlreturns.ReturnInvocation +import edu.uci.ics.amber.engine.common.AmberLogging import edu.uci.ics.amber.engine.common.actormessage.{ActorCommand, PythonActorMessage} import edu.uci.ics.amber.engine.common.ambermessage._ -import edu.uci.ics.amber.engine.common.{AmberLogging, AmberRuntime} import edu.uci.ics.amber.util.ArrowUtils -import edu.uci.ics.amber.core.virtualidentity.ActorVirtualIdentity import org.apache.arrow.flight._ import org.apache.arrow.memory.{ArrowBuf, BufferAllocator, RootAllocator} import org.apache.arrow.vector.VectorSchemaRoot @@ -120,21 +114,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu var payloadV2 = ControlPayloadV2.defaultInstance payloadV2 = payload match { case c: ControlInvocation => - val req = c.command match { - case InitializeExecutorRequest(worker, info, isSource, _) => - val bytes = info.value.toByteArray - val opExecInitInfo: OpExecInitInfo = - AmberRuntime.serde.deserialize(bytes, classOf[OpExecInitInfo]).get - val (code, language) = opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) - InitializeExecutorRequest( - worker, - Any.of("", ByteString.copyFrom(code, "UTF-8")), - isSource, - language - ) - case other => other - } - payloadV2.withControlInvocation(c.withCommand(req)) + payloadV2.withControlInvocation(c) case r: ReturnInvocation => payloadV2.withReturnInvocation(r) case _ => ??? 
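Because the request now carries a language-neutral `OpExecInitInfo`, the proxy
above can forward the control invocation to the Python worker untouched; on
the Scala side, workers dispatch on the sealed oneof instead of deserializing
a lambda. A simplified sketch of that dispatch, assuming only the two primary
variants (the actual handler, later in this patch, also matches `OpExecSink`
and `OpExecSource`):

```scala
import edu.uci.ics.amber.core.executor._

def buildExecutor(init: OpExecInitInfo, workerIdx: Int, workerCount: Int): OperatorExecutor =
  init match {
    case OpExecWithClassName(className, descString) =>
      // ExecFactory reflectively picks a matching constructor:
      // (), (String), (Int, Int), or (String, Int, Int).
      ExecFactory.newExecFromJavaClassName(className, descString, workerIdx, workerCount)
    case OpExecWithCode(code, _) =>
      // Only Java/Scala code strings are compiled here; Python/R code is
      // initialized by the Python worker and never reaches this branch.
      ExecFactory.newExecFromJavaCode(code)
    case other =>
      throw new UnsupportedOperationException(s"unsupported OpExecInitInfo: $other")
  }
```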
diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala index 8567e0f17bb..3d58c3635e6 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/RegionExecutionCoordinator.scala @@ -1,7 +1,5 @@ package edu.uci.ics.amber.engine.architecture.scheduling -import com.google.protobuf.ByteString -import com.google.protobuf.any.Any import com.twitter.util.Future import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.engine.architecture.common.{AkkaActorService, ExecutorDeployment} @@ -25,7 +23,6 @@ import edu.uci.ics.amber.engine.architecture.rpc.controlreturns.{ WorkflowAggregatedState } import edu.uci.ics.amber.engine.architecture.scheduling.config.{OperatorConfig, ResourceConfig} -import edu.uci.ics.amber.engine.common.AmberRuntime import edu.uci.ics.amber.engine.common.rpc.AsyncRPCClient import edu.uci.ics.amber.engine.common.virtualidentity.util.CONTROLLER import edu.uci.ics.amber.core.workflow.PhysicalLink @@ -131,16 +128,11 @@ class RegionExecutionCoordinator( .flatMap(physicalOp => { val workerConfigs = resourceConfig.operatorConfigs(physicalOp.id).workerConfigs workerConfigs.map(_.workerId).map { workerId => - val bytes = AmberRuntime.serde.serialize(physicalOp.opExecInitInfo).get asyncRPCClient.workerInterface.initializeExecutor( InitializeExecutorRequest( workerConfigs.length, - Any.of( - "edu.uci.ics.amber.engine.architecture.deploysemantics.layer.OpExecInitInfo", - ByteString.copyFrom(bytes) - ), - physicalOp.isSourceOperator, - "scala" + physicalOp.opExecInitInfo, + physicalOp.isSourceOperator ), asyncRPCClient.mkContext(workerId) ) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/managers/SerializationManager.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/managers/SerializationManager.scala index a6ea4f74e4b..5a1015e708d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/managers/SerializationManager.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/managers/SerializationManager.scala @@ -1,18 +1,12 @@ package edu.uci.ics.amber.engine.architecture.worker.managers -import edu.uci.ics.amber.core.executor.OpExecInitInfo.generateJavaOpExec -import edu.uci.ics.amber.core.executor.{OpExecInitInfo, OperatorExecutor} +import edu.uci.ics.amber.core.executor._ import edu.uci.ics.amber.core.tuple.TupleLike -import edu.uci.ics.amber.engine.architecture.rpc.controlcommands.InitializeExecutorRequest -import edu.uci.ics.amber.engine.common.{ - AmberLogging, - AmberRuntime, - CheckpointState, - CheckpointSupport -} -import edu.uci.ics.amber.util.VirtualIdentityUtils import edu.uci.ics.amber.core.virtualidentity.ActorVirtualIdentity import edu.uci.ics.amber.core.workflow.PortIdentity +import edu.uci.ics.amber.engine.architecture.rpc.controlcommands.InitializeExecutorRequest +import edu.uci.ics.amber.engine.common.{AmberLogging, CheckpointState, CheckpointSupport} +import edu.uci.ics.amber.util.VirtualIdentityUtils class SerializationManager(val actorId: ActorVirtualIdentity) extends AmberLogging { @@ -26,14 +20,15 @@ class SerializationManager(val actorId: ActorVirtualIdentity) extends AmberLoggi def restoreExecutorState( chkpt: 
CheckpointState ): (OperatorExecutor, Iterator[(TupleLike, Option[PortIdentity])]) = { - val opExecInitInfo: OpExecInitInfo = AmberRuntime.serde - .deserialize(execInitMsg.opExecInitInfo.value.toByteArray, classOf[OpExecInitInfo]) - .get - val executor = generateJavaOpExec( - opExecInitInfo, - VirtualIdentityUtils.getWorkerIndex(actorId), - execInitMsg.totalWorkerCount - ) + val workerIdx = VirtualIdentityUtils.getWorkerIndex(actorId) + val workerCount = execInitMsg.totalWorkerCount + val executor = execInitMsg.opExecInitInfo match { + case OpExecWithClassName(className, descString) => + ExecFactory.newExecFromJavaClassName(className, descString, workerIdx, workerCount) + case OpExecWithCode(code, language) => ExecFactory.newExecFromJavaCode(code) + case _ => throw new UnsupportedOperationException("Unsupported OpExec type") + } + val iter = executor match { case support: CheckpointSupport => support.deserializeState(chkpt) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala index a27e0ba614d..c727f422da9 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/worker/promisehandlers/InitializeExecutorHandler.scala @@ -1,15 +1,15 @@ package edu.uci.ics.amber.engine.architecture.worker.promisehandlers import com.twitter.util.Future -import edu.uci.ics.amber.core.executor.OpExecInitInfo -import edu.uci.ics.amber.core.executor.OpExecInitInfo.generateJavaOpExec +import edu.uci.ics.amber.core.executor._ import edu.uci.ics.amber.engine.architecture.rpc.controlcommands.{ AsyncRPCContext, InitializeExecutorRequest } import edu.uci.ics.amber.engine.architecture.rpc.controlreturns.EmptyReturn import edu.uci.ics.amber.engine.architecture.worker.DataProcessorRPCHandlerInitializer -import edu.uci.ics.amber.engine.common.AmberRuntime +import edu.uci.ics.amber.operator.sink.ProgressiveSinkOpExec +import edu.uci.ics.amber.operator.source.cache.CacheSourceOpExec import edu.uci.ics.amber.util.VirtualIdentityUtils trait InitializeExecutorHandler { @@ -20,14 +20,21 @@ trait InitializeExecutorHandler { ctx: AsyncRPCContext ): Future[EmptyReturn] = { dp.serializationManager.setOpInitialization(req) - val bytes = req.opExecInitInfo.value.toByteArray - val opExecInitInfo: OpExecInitInfo = - AmberRuntime.serde.deserialize(bytes, classOf[OpExecInitInfo]).get - dp.executor = generateJavaOpExec( - opExecInitInfo, - VirtualIdentityUtils.getWorkerIndex(actorId), - req.totalWorkerCount - ) + val workerIdx = VirtualIdentityUtils.getWorkerIndex(actorId) + val workerCount = req.totalWorkerCount + dp.executor = req.opExecInitInfo match { + case OpExecWithClassName(className, descString) => + ExecFactory.newExecFromJavaClassName(className, descString, workerIdx, workerCount) + case OpExecWithCode(code, _) => ExecFactory.newExecFromJavaCode(code) + case OpExecSink(storageKey, workflowIdentity, outputMode) => + new ProgressiveSinkOpExec( + outputMode, + storageKey, + workflowIdentity + ) + case OpExecSource(storageKey, workflowIdentity) => + new CacheSourceOpExec(storageKey, workflowIdentity) + } EmptyReturn() } diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/LogicalPlan.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/LogicalPlan.scala index 
a76b4b589af..ca52eb9708c 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/LogicalPlan.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/LogicalPlan.scala @@ -84,7 +84,7 @@ case class LogicalPlan( val fileUri = FileResolver.resolve(fileName) // Convert to URI // Set the URI in the ScanSourceOpDesc - scanOp.setFileUri(fileUri) + scanOp.setResolvedFileName(fileUri) } match { case Success(_) => // Successfully resolved and set the file URI diff --git a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/WorkflowCompiler.scala b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/WorkflowCompiler.scala index 6c13eadbffe..a01a1d3b38b 100644 --- a/core/amber/src/main/scala/edu/uci/ics/texera/workflow/WorkflowCompiler.scala +++ b/core/amber/src/main/scala/edu/uci/ics/texera/workflow/WorkflowCompiler.scala @@ -8,7 +8,7 @@ import edu.uci.ics.amber.engine.common.Utils.objectMapper import edu.uci.ics.amber.operator.SpecialPhysicalOpFactory import edu.uci.ics.amber.core.virtualidentity.OperatorIdentity import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode.SINGLE_SNAPSHOT -import edu.uci.ics.amber.core.workflow.{PhysicalLink, PortIdentity} +import edu.uci.ics.amber.core.workflow.PhysicalLink import edu.uci.ics.texera.web.model.websocket.request.LogicalPlanPojo import edu.uci.ics.texera.web.service.ExecutionsMetadataPersistService diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala index aab2548242a..4fc14016ac1 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala @@ -3,10 +3,16 @@ package edu.uci.ics.amber.engine.architecture.worker import akka.actor.{ActorRef, ActorSystem, Props} import akka.serialization.SerializationExtension import akka.testkit.{ImplicitSender, TestActorRef, TestKit} -import com.google.protobuf.ByteString -import com.google.protobuf.any.{Any => ProtoAny} import edu.uci.ics.amber.clustering.SingleNodeListener -import edu.uci.ics.amber.core.executor.{OpExecInitInfo, OperatorExecutor} +import edu.uci.ics.amber.core.executor.{OpExecWithClassName, OperatorExecutor} +import edu.uci.ics.amber.core.tuple._ +import edu.uci.ics.amber.core.virtualidentity.{ + ActorVirtualIdentity, + ChannelIdentity, + OperatorIdentity, + PhysicalOpIdentity +} +import edu.uci.ics.amber.core.workflow.{PhysicalLink, PortIdentity} import edu.uci.ics.amber.engine.architecture.common.WorkflowActor.NetworkMessage import edu.uci.ics.amber.engine.architecture.rpc.controlcommands._ import edu.uci.ics.amber.engine.architecture.rpc.workerservice.WorkerServiceGrpc._ @@ -20,13 +26,6 @@ import edu.uci.ics.amber.engine.common.AmberRuntime import edu.uci.ics.amber.engine.common.ambermessage.{DataFrame, DataPayload, WorkflowFIFOMessage} import edu.uci.ics.amber.engine.common.rpc.AsyncRPCClient import edu.uci.ics.amber.engine.common.virtualidentity.util.CONTROLLER -import edu.uci.ics.amber.core.virtualidentity.{ - ActorVirtualIdentity, - ChannelIdentity, - OperatorIdentity, - PhysicalOpIdentity -} -import edu.uci.ics.amber.core.workflow.{PhysicalLink, PortIdentity} import org.scalamock.scalatest.MockFactory import org.scalatest.BeforeAndAfterAll import org.scalatest.flatspec.AnyFlatSpecLike @@ -35,7 +34,6 @@ import java.util.concurrent.CompletableFuture import scala.collection.mutable import 
scala.concurrent.duration.MILLISECONDS import scala.util.Random -import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple, TupleLike} class DummyOperatorExecutor extends OperatorExecutor { override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = { Iterator(tuple) @@ -168,17 +166,14 @@ class WorkerSpec AsyncRPCContext(CONTROLLER, identifier1), 3 ) - val opInit = OpExecInitInfo((_, _) => { - new DummyOperatorExecutor() - }) - val bytes = AmberRuntime.serde.serialize(opInit).get - val protoAny = ProtoAny.of( - "edu.uci.ics.amber.engine.architecture.deploysemantics.layer.OpExecInitInfo", - ByteString.copyFrom(bytes) - ) + val initializeOperatorLogic = AsyncRPCClient.ControlInvocation( METHOD_INITIALIZE_EXECUTOR, - InitializeExecutorRequest(1, protoAny, isSource = false, "scala"), + InitializeExecutorRequest( + 1, + OpExecWithClassName("edu.uci.ics.amber.engine.architecture.worker.DummyOperatorExecutor"), + isSource = false + ), AsyncRPCContext(CONTROLLER, identifier1), 4 ) diff --git a/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/model/LogicalPlan.scala b/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/model/LogicalPlan.scala index ea79ba5ceb7..db700e8a3df 100644 --- a/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/model/LogicalPlan.scala +++ b/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/model/LogicalPlan.scala @@ -2,10 +2,7 @@ package edu.uci.ics.amber.compiler.model import com.typesafe.scalalogging.LazyLogging import edu.uci.ics.amber.core.storage.FileResolver -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.WorkflowContext import edu.uci.ics.amber.operator.LogicalOp -import edu.uci.ics.amber.operator.source.SourceOperatorDescriptor import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc import edu.uci.ics.amber.core.virtualidentity.OperatorIdentity import edu.uci.ics.amber.core.workflow.PortIdentity @@ -14,8 +11,6 @@ import org.jgrapht.util.SupplierUtil import java.util import scala.collection.mutable.ArrayBuffer -import scala.collection.mutable -import scala.jdk.CollectionConverters.SetHasAsScala import scala.util.{Failure, Success, Try} object LogicalPlan { @@ -107,7 +102,7 @@ case class LogicalPlan( val fileUri = FileResolver.resolve(fileName) // Convert to URI // Set the URI in the ScanSourceOpDesc - scanOp.setFileUri(fileUri) + scanOp.setResolvedFileName(fileUri) } match { case Success(_) => // Successfully resolved and set the file URI case Failure(err) => diff --git a/core/workflow-core/src/main/protobuf/edu/uci/ics/amber/core/executor.proto b/core/workflow-core/src/main/protobuf/edu/uci/ics/amber/core/executor.proto new file mode 100644 index 00000000000..fdc98b2b34f --- /dev/null +++ b/core/workflow-core/src/main/protobuf/edu/uci/ics/amber/core/executor.proto @@ -0,0 +1,44 @@ +syntax = "proto3"; +package edu.uci.ics.amber.core; + + +import "edu/uci/ics/amber/core/virtualidentity.proto"; +import "edu/uci/ics/amber/core/workflow.proto"; +import "scalapb/scalapb.proto"; + +option (scalapb.options) = { + scope: FILE, + preserve_unknown_fields: false + no_default_values_in_constructor: false +}; + + +message OpExecWithCode { + string code = 1; + string language = 2; +} + +message OpExecWithClassName { + string className = 1; + string descString = 2; +} + +message OpExecSink { + string storageKey = 1; + WorkflowIdentity workflowIdentity = 2 [(scalapb.field).no_box = true]; + OutputPort.OutputMode 
outputMode = 3 [(scalapb.field).no_box = true]; +} + +message OpExecSource { + string storageKey = 1; + WorkflowIdentity workflowIdentity = 2 [(scalapb.field).no_box = true]; +} + +message OpExecInitInfo { + oneof sealed_value { + OpExecWithClassName opExecWithClassName = 1; + OpExecWithCode opExecWithCode = 2; + OpExecSink opExecSink = 3; + OpExecSource opExecSource = 4; + } +} \ No newline at end of file diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/ExecFactory.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/ExecFactory.scala new file mode 100644 index 00000000000..8e53e2ca4ff --- /dev/null +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/ExecFactory.scala @@ -0,0 +1,44 @@ +package edu.uci.ics.amber.core.executor + +object ExecFactory { + + def newExecFromJavaCode(code: String): OperatorExecutor = { + JavaRuntimeCompilation + .compileCode(code) + .getDeclaredConstructor() + .newInstance() + .asInstanceOf[OperatorExecutor] + } + + def newExecFromJavaClassName[K]( + className: String, + descString: String = "", + idx: Int = 0, + workerCount: Int = 1 + ): OperatorExecutor = { + val clazz = Class.forName(className).asInstanceOf[Class[K]] + try { + if (descString.isEmpty) { + clazz.getDeclaredConstructor().newInstance().asInstanceOf[OperatorExecutor] + } else { + clazz + .getDeclaredConstructor(classOf[String]) + .newInstance(descString) + .asInstanceOf[OperatorExecutor] + } + } catch { + case e: NoSuchMethodException => + if (descString.isEmpty) { + clazz + .getDeclaredConstructor(classOf[Int], classOf[Int]) + .newInstance(idx, workerCount) + .asInstanceOf[OperatorExecutor] + } else { + clazz + .getDeclaredConstructor(classOf[String], classOf[Int], classOf[Int]) + .newInstance(descString, idx, workerCount) + .asInstanceOf[OperatorExecutor] + } + } + } +} diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/OpExecInitInfo.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/OpExecInitInfo.scala deleted file mode 100644 index 2e315e6296a..00000000000 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/executor/OpExecInitInfo.scala +++ /dev/null @@ -1,54 +0,0 @@ -package edu.uci.ics.amber.core.executor - -object OpExecInitInfo { - - type OpExecFunc = (Int, Int) => OperatorExecutor - type JavaOpExecFunc = - java.util.function.Function[(Int, Int), OperatorExecutor] with java.io.Serializable - - def generateJavaOpExec( - opExecInitInfo: OpExecInitInfo, - workerIdx: Int, - numWorkers: Int - ): OperatorExecutor = { - opExecInitInfo match { - case OpExecInitInfoWithCode(codeGen) => - val (code, _) = - codeGen(workerIdx, numWorkers) - JavaRuntimeCompilation - .compileCode(code) - .getDeclaredConstructor() - .newInstance() - .asInstanceOf[OperatorExecutor] - case OpExecInitInfoWithFunc(opGen) => - opGen( - workerIdx, - numWorkers - ) - } - } - - def apply(code: String, language: String): OpExecInitInfo = - OpExecInitInfoWithCode((_, _) => (code, language)) - def apply(opExecFunc: OpExecFunc): OpExecInitInfo = OpExecInitInfoWithFunc(opExecFunc) - def apply(opExecFunc: JavaOpExecFunc): OpExecInitInfo = - OpExecInitInfoWithFunc((idx, totalWorkerCount) => opExecFunc.apply(idx, totalWorkerCount)) -} - -/** - * Information regarding initializing an operator executor instance - * it could be two cases: - * - OpExecInitInfoWithFunc: - * A function to create an operator executor instance, with parameters: - * 1) the worker index, 2) the PhysicalOp; - * - 
OpExecInitInfoWithCode: - * A function returning the code string that to be compiled in a virtual machine. - */ -sealed trait OpExecInitInfo - -final case class OpExecInitInfoWithCode( - codeGen: (Int, Int) => (String, String) -) extends OpExecInitInfo -final case class OpExecInitInfoWithFunc( - opGen: (Int, Int) => OperatorExecutor -) extends OpExecInitInfo diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala index b2c2fafb0ba..c90707a77a8 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/storage/FileResolver.scala @@ -25,6 +25,9 @@ object FileResolver { * @return Either[String, DatasetFileDocument] - the resolved path as a String or a DatasetFileDocument */ def resolve(fileName: String): URI = { + if (isFileResolved(fileName)) { + return new URI(fileName) + } val resolvers: Seq[String => URI] = Seq(localResolveFunc, datasetResolveFunc) // Try each resolver function in sequence @@ -131,4 +134,19 @@ object FileResolver { throw new FileNotFoundException(s"Dataset file $fileName not found.") } } + + /** + * Checks if a given file path has a valid scheme. + * + * @param filePath The file path to check. + * @return `true` if the file path contains a valid scheme, `false` otherwise. + */ + def isFileResolved(filePath: String): Boolean = { + try { + val uri = new URI(filePath) + uri.getScheme != null && uri.getScheme.nonEmpty + } catch { + case _: Exception => false // Invalid URI format + } + } } diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala index daf7cd679f9..d493f0891a5 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala @@ -2,7 +2,7 @@ package edu.uci.ics.amber.core.workflow import com.fasterxml.jackson.annotation.{JsonIgnore, JsonIgnoreProperties} import com.typesafe.scalalogging.LazyLogging -import edu.uci.ics.amber.core.executor.{OpExecInitInfo, OpExecInitInfoWithCode} +import edu.uci.ics.amber.core.executor.{OpExecWithCode, OpExecInitInfo} import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.virtualidentity.{ ExecutionIdentity, @@ -10,7 +10,6 @@ import edu.uci.ics.amber.core.virtualidentity.{ PhysicalOpIdentity, WorkflowIdentity } -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} import org.jgrapht.graph.{DefaultEdge, DirectedAcyclicGraph} import org.jgrapht.traverse.TopologicalOrderIterator @@ -84,7 +83,7 @@ object PhysicalOp { executionId: ExecutionIdentity, opExecInitInfo: OpExecInitInfo ): PhysicalOp = - PhysicalOp(physicalOpId, workflowId, executionId, opExecInitInfo = opExecInitInfo) + PhysicalOp(physicalOpId, workflowId, executionId, opExecInitInfo) def manyToOnePhysicalOp( workflowId: WorkflowIdentity, @@ -138,31 +137,6 @@ object PhysicalOp { manyToOnePhysicalOp(physicalOpId, workflowId, executionId, opExecInitInfo) .withLocationPreference(Some(PreferController)) } - - def getExternalPortSchemas( - physicalOp: PhysicalOp, - fromInput: Boolean, - errorList: Option[ArrayBuffer[(OperatorIdentity, Throwable)]] - ): List[Option[Schema]] = { - - // Select either input ports or output ports and filter out the internal ports - 
val ports = if (fromInput) { - physicalOp.inputPorts.values.filterNot { case (port, _, _) => port.id.internal } - } else { - physicalOp.outputPorts.values.filterNot { case (port, _, _) => port.id.internal } - } - - ports.map { - case (_, _, schema) => - schema match { - case Left(err) => - errorList.foreach(errList => errList.append((physicalOp.id.logicalOpId, err))) - None - case Right(validSchema) => - Some(validSchema) - } - }.toList - } } // @JsonIgnore is not working when directly annotated to fields of a case class @@ -218,8 +192,6 @@ case class PhysicalOp( .toList .distinct - private lazy val isInitWithCode: Boolean = opExecInitInfo.isInstanceOf[OpExecInitInfoWithCode] - /** * Helper functions related to compile-time operations */ @@ -239,20 +211,12 @@ case class PhysicalOp( @JsonIgnore // this is needed to prevent the serialization issue def isPythonBased: Boolean = { opExecInitInfo match { - case opExecInfo: OpExecInitInfoWithCode => - val (_, language) = opExecInfo.codeGen(0, 0) + case OpExecWithCode(_, language) => language == "python" || language == "r-tuple" || language == "r-table" case _ => false } } - @JsonIgnore // this is needed to prevent the serialization issue - def getPythonCode: String = { - val (code, _) = - opExecInitInfo.asInstanceOf[OpExecInitInfoWithCode].codeGen(0, 0) - code - } - /** * creates a copy with the location preference information */ diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala index c5cc4fd152f..941db76f9d5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala @@ -1,29 +1,27 @@ package edu.uci.ics.amber.operator -import edu.uci.ics.amber.core.executor.OpExecInitInfoWithCode -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.executor.OpExecWithCode import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} trait PythonOperatorDescriptor extends LogicalOp { override def getPhysicalOp( workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val opExecInitInfo = OpExecInitInfoWithCode((_, _) => (generatePythonCode(), "python")) - val physicalOp = if (asSource()) { PhysicalOp.sourcePhysicalOp( workflowId, executionId, operatorIdentifier, - opExecInitInfo + OpExecWithCode(generatePythonCode(), "python") ) } else { PhysicalOp.oneToOnePhysicalOp( workflowId, executionId, operatorIdentifier, - opExecInitInfo + OpExecWithCode(generatePythonCode(), "python") ) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala index 96776d36b62..e60040eb467 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala @@ -1,12 +1,8 @@ package edu.uci.ics.amber.operator -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.{OpExecSink, OpExecSource} import edu.uci.ics.amber.core.storage.result.{OpResultStorage, ResultStorage} import 
edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} -import edu.uci.ics.amber.operator.sink.ProgressiveUtils -import edu.uci.ics.amber.operator.sink.managed.ProgressiveSinkOpExec -import edu.uci.ics.amber.operator.source.cache.CacheSourceOpExec import edu.uci.ics.amber.core.virtualidentity.{ ExecutionIdentity, PhysicalOpIdentity, @@ -18,7 +14,8 @@ import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode.{ SET_SNAPSHOT, SINGLE_SNAPSHOT } -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} +import edu.uci.ics.amber.core.workflow._ +import edu.uci.ics.amber.operator.sink.ProgressiveUtils object SpecialPhysicalOpFactory { def newSinkPhysicalOp( @@ -33,13 +30,7 @@ object SpecialPhysicalOpFactory { PhysicalOpIdentity(opId, s"sink${portId.id}"), workflowIdentity, executionIdentity, - OpExecInitInfo((idx, workers) => - new ProgressiveSinkOpExec( - outputMode, - storageKey, - workflowIdentity - ) - ) + OpExecSink(storageKey, workflowIdentity, outputMode) ) .withInputPorts(List(InputPort(PortIdentity(internal = true)))) .withOutputPorts(List(OutputPort(PortIdentity(internal = true)))) @@ -90,7 +81,7 @@ object SpecialPhysicalOpFactory { PhysicalOpIdentity(opId, s"source${portId.id}"), workflowIdentity, executionIdentity, - OpExecInitInfo((_, _) => new CacheSourceOpExec(storageKey, workflowIdentity)) + OpExecSource(storageKey, workflowIdentity) ) .withInputPorts(List.empty) .withOutputPorts(List(outputPort)) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/TestOperators.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/TestOperators.scala index bf03e272577..7f428ab7967 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/TestOperators.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/TestOperators.scala @@ -67,7 +67,7 @@ object TestOperators { csvHeaderlessOp.fileName = Some(fileName) csvHeaderlessOp.customDelimiter = Some(",") csvHeaderlessOp.hasHeader = header - csvHeaderlessOp.setFileUri(FileResolver.resolve(fileName)) + csvHeaderlessOp.setResolvedFileName(FileResolver.resolve(fileName)) csvHeaderlessOp } @@ -76,7 +76,7 @@ object TestOperators { val jsonlOp = new JSONLScanSourceOpDesc jsonlOp.fileName = Some(fileName) jsonlOp.flatten = flatten - jsonlOp.setFileUri(FileResolver.resolve(fileName)) + jsonlOp.setResolvedFileName(FileResolver.resolve(fileName)) jsonlOp } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala index b27c9cac387..14c138562f4 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala @@ -2,23 +2,18 @@ package edu.uci.ics.amber.operator.aggregate import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{ - HashPartition, - PhysicalOp, - PhysicalPlan, - SchemaPropagationFunc -} -import edu.uci.ics.amber.operator.LogicalOp -import 
edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeNameList -import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.virtualidentity.{ ExecutionIdentity, PhysicalOpIdentity, WorkflowIdentity } -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} +import edu.uci.ics.amber.core.workflow._ +import edu.uci.ics.amber.operator.LogicalOp +import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeNameList +import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import javax.validation.constraints.{NotNull, Size} @@ -42,49 +37,52 @@ class AggregateOpDesc extends LogicalOp { // TODO: this is supposed to be blocking but due to limitations of materialization naming on the logical operator // we are keeping it not annotated as blocking. + val inputPort = InputPort(PortIdentity()) val outputPort = OutputPort(PortIdentity(internal = true)) - val partialPhysicalOp = - PhysicalOp - .oneToOnePhysicalOp( - PhysicalOpIdentity(operatorIdentifier, "localAgg"), - workflowId, - executionId, - OpExecInitInfo((_, _) => new AggregateOpExec(aggregations, groupByKeys)) - ) - .withIsOneToManyOp(true) - .withInputPorts(List(InputPort(PortIdentity()))) - .withOutputPorts(List(outputPort)) - .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - PortIdentity(internal = true) -> getOutputSchema( - operatorInfo.inputPorts.map(port => inputSchemas(port.id)).toArray - ) + val partialDesc = objectMapper.writeValueAsString(this) + val localAggregations = List(aggregations: _*) + val partialPhysicalOp = PhysicalOp + .oneToOnePhysicalOp( + PhysicalOpIdentity(operatorIdentifier, "localAgg"), + workflowId, + executionId, + OpExecWithClassName("edu.uci.ics.amber.operator.aggregate.AggregateOpExec", partialDesc) + ) + .withIsOneToManyOp(true) + .withInputPorts(List(inputPort)) + .withOutputPorts(List(outputPort)) + .withPropagateSchema( + SchemaPropagationFunc(inputSchemas => { + aggregations = localAggregations + Map( + PortIdentity(internal = true) -> getOutputSchema( + operatorInfo.inputPorts.map(port => inputSchemas(port.id)).toArray ) ) - ) - - val inputPort = InputPort(PortIdentity(0, internal = true)) + }) + ) + val finalInputPort = InputPort(PortIdentity(0, internal = true)) val finalOutputPort = OutputPort(PortIdentity(0), blocking = true) + // change aggregations to final + aggregations = aggregations.map(aggr => aggr.getFinal) + val finalDesc = objectMapper.writeValueAsString(this) val finalPhysicalOp = PhysicalOp .oneToOnePhysicalOp( PhysicalOpIdentity(operatorIdentifier, "globalAgg"), workflowId, executionId, - OpExecInitInfo((_, _) => - new AggregateOpExec(aggregations.map(aggr => aggr.getFinal), groupByKeys) - ) + OpExecWithClassName("edu.uci.ics.amber.operator.aggregate.AggregateOpExec", finalDesc) ) .withParallelizable(false) .withIsOneToManyOp(true) - .withInputPorts(List(inputPort)) + .withInputPorts(List(finalInputPort)) .withOutputPorts(List(finalOutputPort)) .withPropagateSchema( SchemaPropagationFunc(inputSchemas => Map(operatorInfo.outputPorts.head.id -> { - inputSchemas(PortIdentity(internal = true)) + inputSchemas(finalInputPort.id) }) ) ) @@ -94,7 +92,7 @@ class AggregateOpDesc extends LogicalOp { var plan = PhysicalPlan( operators = Set(partialPhysicalOp, finalPhysicalOp), links = Set( - PhysicalLink(partialPhysicalOp.id, outputPort.id, finalPhysicalOp.id, inputPort.id) + 
PhysicalLink(partialPhysicalOp.id, outputPort.id, finalPhysicalOp.id, finalInputPort.id) ) ) plan.operators.foreach(op => plan = plan.setOperator(op.withIsOneToManyOp(true))) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpExec.scala index d3b3d82022e..147803c6fca 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpExec.scala @@ -2,20 +2,15 @@ package edu.uci.ics.amber.operator.aggregate import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import scala.collection.mutable /** * AggregateOpExec performs aggregation operations on input tuples, optionally grouping them by specified keys. - * - * @param aggregations a list of aggregation operations to apply on the tuples - * @param groupByKeys a list of attribute names to group the tuples by */ -class AggregateOpExec( - aggregations: List[AggregationOperation], - groupByKeys: List[String] -) extends OperatorExecutor { - +class AggregateOpExec(descString: String) extends OperatorExecutor { + private val desc: AggregateOpDesc = objectMapper.readValue(descString, classOf[AggregateOpDesc]) private val keyedPartialAggregates = new mutable.HashMap[List[Object], List[Object]]() private var distributedAggregations: List[DistributedAggregation[Object]] = _ @@ -23,12 +18,13 @@ class AggregateOpExec( // Initialize distributedAggregations if it's not yet initialized if (distributedAggregations == null) { - distributedAggregations = - aggregations.map(agg => agg.getAggFunc(tuple.getSchema.getAttribute(agg.attribute).getType)) + distributedAggregations = desc.aggregations.map(agg => + agg.getAggFunc(tuple.getSchema.getAttribute(agg.attribute).getType) + ) } // Construct the group key - val key = groupByKeys.map(tuple.getField[Object]) + val key = desc.groupByKeys.map(tuple.getField[Object]) // Get or initialize the partial aggregate for the key val partialAggregates = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala index 2be0c18e598..7e71d29b42b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala @@ -1,12 +1,11 @@ package edu.uci.ics.amber.operator.cartesianProduct -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, Schema} -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class CartesianProductOpDesc extends LogicalOp { override def 
getPhysicalOp( @@ -18,7 +17,7 @@ class CartesianProductOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new CartesianProductOpExec()) + OpExecWithClassName("edu.uci.ics.amber.operator.cartesianProduct.CartesianProductOpExec") ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala index 6921cccfba7..4a2cb463355 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala @@ -2,12 +2,13 @@ package edu.uci.ics.amber.operator.dictionary import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.google.common.base.Preconditions -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.map.MapOpDesc import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -39,7 +40,10 @@ class DictionaryMatcherOpDesc extends MapOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new DictionaryMatcherOpExec(attribute, dictionary, matchingType)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.dictionary.DictionaryMatcherOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExec.scala index 8233e8087a7..7a811537d2d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExec.scala @@ -2,19 +2,20 @@ package edu.uci.ics.amber.operator.dictionary import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} import edu.uci.ics.amber.operator.map.MapOpExec +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.apache.lucene.analysis.Analyzer import org.apache.lucene.analysis.en.EnglishAnalyzer import org.apache.lucene.analysis.tokenattributes.CharTermAttribute + import java.io.StringReader import scala.collection.mutable import scala.collection.mutable.ListBuffer class DictionaryMatcherOpExec( - attributeName: String, - dictionary: String, - matchingType: MatchingType + descString: String ) extends MapOpExec { - + private val desc: DictionaryMatcherOpDesc = + objectMapper.readValue(descString, classOf[DictionaryMatcherOpDesc]) // this is needed for the matching types Phrase and Conjunction var tokenizedDictionaryEntries: ListBuffer[mutable.Set[String]] = _ // this is needed for the simple Scan matching 
type @@ -40,8 +41,8 @@ class DictionaryMatcherOpExec( */ override def open(): Unit = { // create the dictionary by splitting the values first - dictionaryEntries = dictionary.split(",").toList.map(_.toLowerCase) - if (matchingType == MatchingType.CONJUNCTION_INDEXBASED) { + dictionaryEntries = desc.dictionary.split(",").toList.map(_.toLowerCase) + if (desc.matchingType == MatchingType.CONJUNCTION_INDEXBASED) { // then tokenize each entry this.luceneAnalyzer = new EnglishAnalyzer tokenizedDictionaryEntries = ListBuffer[mutable.Set[String]]() @@ -72,12 +73,12 @@ class DictionaryMatcherOpExec( * @return true if the tuple matches a dictionary entry according to the matching criteria; false otherwise. */ private def isTupleInDictionary(tuple: Tuple): Boolean = { - val text = tuple.getField(attributeName).asInstanceOf[String].toLowerCase + val text = tuple.getField(desc.attribute).asInstanceOf[String].toLowerCase // Return false if the text is empty, as it cannot match any dictionary entry if (text.isEmpty) return false - matchingType match { + desc.matchingType match { case MatchingType.SCANBASED => // Directly check if the dictionary contains the text dictionaryEntries.contains(text) @@ -130,7 +131,7 @@ class DictionaryMatcherOpExec( */ private def labelTupleIfMatched(tuple: Tuple): TupleLike = { val isMatched = - Option(tuple.getField[Any](attributeName)).exists(_ => isTupleInDictionary(tuple)) + Option(tuple.getField[Any](desc.attribute)).exists(_ => isTupleInDictionary(tuple)) TupleLike(tuple.getFields ++ Seq(isMatched)) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala index a8c25ad2363..8c144b3756a 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala @@ -1,7 +1,7 @@ package edu.uci.ics.amber.operator.difference import com.google.common.base.Preconditions -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} import edu.uci.ics.amber.operator.LogicalOp @@ -20,7 +20,7 @@ class DifferenceOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new DifferenceOpExec()) + OpExecWithClassName("edu.uci.ics.amber.operator.difference.DifferenceOpExec") ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala index ae00eb38c10..30c2f9f4b27 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala @@ -1,7 +1,7 @@ package edu.uci.ics.amber.operator.distinct import com.google.common.base.Preconditions -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} import edu.uci.ics.amber.operator.LogicalOp @@ -20,7 +20,7 @@ class 
DistinctOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new DistinctOpExec()) + OpExecWithClassName("edu.uci.ics.amber.operator.distinct.DistinctOpExec") ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpDesc.scala index 61b87009377..340e2ee8b48 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpDesc.scala @@ -1,9 +1,10 @@ package edu.uci.ics.amber.operator.filter import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -22,7 +23,10 @@ class SpecializedFilterOpDesc extends FilterOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new SpecializedFilterOpExec(predicates)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.filter.SpecializedFilterOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExec.scala index 096721decac..88b83282153 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExec.scala @@ -1,8 +1,10 @@ package edu.uci.ics.amber.operator.filter import edu.uci.ics.amber.core.tuple.Tuple +import edu.uci.ics.amber.util.JSONUtils.objectMapper -class SpecializedFilterOpExec(predicates: List[FilterPredicate]) extends FilterOpExec { - - setFilterFunc((tuple: Tuple) => predicates.exists(_.evaluate(tuple))) +class SpecializedFilterOpExec(descString: String) extends FilterOpExec { + private val desc: SpecializedFilterOpDesc = + objectMapper.readValue(descString, classOf[SpecializedFilterOpDesc]) + setFilterFunc((tuple: Tuple) => desc.predicates.exists(_.evaluate(tuple))) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinBuildOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinBuildOpExec.scala index 08633de0c62..ced8d06de0f 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinBuildOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinBuildOpExec.scala @@ -2,17 +2,19 @@ package edu.uci.ics.amber.operator.hashJoin import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import scala.collection.mutable import 
scala.collection.mutable.ListBuffer -class HashJoinBuildOpExec[K](buildAttributeName: String) extends OperatorExecutor { - +class HashJoinBuildOpExec[K](descString: String) extends OperatorExecutor { + private val desc: HashJoinOpDesc[K] = + objectMapper.readValue(descString, classOf[HashJoinOpDesc[K]]) var buildTableHashMap: mutable.HashMap[K, ListBuffer[Tuple]] = _ override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = { - val key = tuple.getField(buildAttributeName).asInstanceOf[K] + val key = tuple.getField(desc.buildAttributeName).asInstanceOf[K] buildTableHashMap.getOrElseUpdate(key, new ListBuffer[Tuple]()) += tuple Iterator() } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala index fe009a91989..3777b2b6216 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala @@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.hashJoin import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.LogicalOp @@ -18,6 +18,7 @@ import edu.uci.ics.amber.operator.metadata.annotations.{ AutofillAttributeNameOnPort1 } import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import edu.uci.ics.amber.util.JSONUtils.objectMapper object HashJoinOpDesc { val HASH_JOIN_INTERNAL_KEY_NAME = "__internal__hashtable__key__" @@ -67,7 +68,10 @@ class HashJoinOpDesc[K] extends LogicalOp { PhysicalOpIdentity(operatorIdentifier, "build"), workflowId, executionId, - OpExecInitInfo((_, _) => new HashJoinBuildOpExec[K](buildAttributeName)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.hashJoin.HashJoinBuildOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(List(buildInputPort)) .withOutputPorts(List(buildOutputPort)) @@ -96,11 +100,9 @@ class HashJoinOpDesc[K] extends LogicalOp { PhysicalOpIdentity(operatorIdentifier, "probe"), workflowId, executionId, - OpExecInitInfo((_, _) => - new HashJoinProbeOpExec[K]( - probeAttributeName, - joinType - ) + OpExecWithClassName( + "edu.uci.ics.amber.operator.hashJoin.HashJoinProbeOpExec", + objectMapper.writeValueAsString(this) ) ) .withInputPorts( diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinProbeOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinProbeOpExec.scala index 38d46367a4f..4483c09dc25 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinProbeOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinProbeOpExec.scala @@ -3,6 +3,7 @@ package edu.uci.ics.amber.operator.hashJoin import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} import edu.uci.ics.amber.operator.hashJoin.HashJoinOpDesc.HASH_JOIN_INTERNAL_KEY_NAME +import edu.uci.ics.amber.util.JSONUtils.objectMapper import scala.collection.mutable import 
scala.collection.mutable.ListBuffer @@ -41,11 +42,11 @@ object JoinUtils { } class HashJoinProbeOpExec[K]( - probeAttributeName: String, - joinType: JoinType + descString: String ) extends OperatorExecutor { - var currentTuple: Tuple = _ + private val desc: HashJoinOpDesc[K] = + objectMapper.readValue(descString, classOf[HashJoinOpDesc[K]]) var buildTableHashMap: mutable.HashMap[K, (ListBuffer[Tuple], Boolean)] = _ override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = @@ -59,7 +60,7 @@ class HashJoinProbeOpExec[K]( Iterator.empty } else { // Probe phase - val key = tuple.getField(probeAttributeName).asInstanceOf[K] + val key = tuple.getField(desc.probeAttributeName).asInstanceOf[K] val (matchedTuples, joined) = buildTableHashMap.getOrElse(key, (new ListBuffer[Tuple](), false)) @@ -67,7 +68,7 @@ class HashJoinProbeOpExec[K]( // Join match found buildTableHashMap.put(key, (matchedTuples, true)) performJoin(tuple, matchedTuples) - } else if (joinType == JoinType.RIGHT_OUTER || joinType == JoinType.FULL_OUTER) { + } else if (desc.joinType == JoinType.RIGHT_OUTER || desc.joinType == JoinType.FULL_OUTER) { // Handle right and full outer joins without a match performRightAntiJoin(tuple) } else { @@ -77,7 +78,9 @@ class HashJoinProbeOpExec[K]( } override def onFinish(port: Int): Iterator[TupleLike] = { - if (port == 1 && (joinType == JoinType.LEFT_OUTER || joinType == JoinType.FULL_OUTER)) { + if ( + port == 1 && (desc.joinType == JoinType.LEFT_OUTER || desc.joinType == JoinType.FULL_OUTER) + ) { // Handle left and full outer joins after input is exhausted performLeftAntiJoin } else { @@ -104,7 +107,11 @@ class HashJoinProbeOpExec[K]( matchedTuples: ListBuffer[Tuple] ): Iterator[TupleLike] = { matchedTuples.iterator.map { buildTuple => - JoinUtils.joinTuples(buildTuple, probeTuple, skipAttributeName = Some(probeAttributeName)) + JoinUtils.joinTuples( + buildTuple, + probeTuple, + skipAttributeName = Some(desc.probeAttributeName) + ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala index 8fc2e999ee7..1de8534ac11 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala @@ -1,7 +1,7 @@ package edu.uci.ics.amber.operator.intersect import com.google.common.base.Preconditions -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} import edu.uci.ics.amber.operator.LogicalOp @@ -20,7 +20,7 @@ class IntersectOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new IntersectOpExec()) + OpExecWithClassName("edu.uci.ics.amber.operator.intersect.IntersectOpExec") ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala index d27792c044d..985b8f9c4d6 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala +++ 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala @@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.intervalJoin import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, Schema} import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.LogicalOp @@ -12,6 +12,7 @@ import edu.uci.ics.amber.operator.metadata.annotations.{ AutofillAttributeName, AutofillAttributeNameOnPort1 } +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} @@ -82,15 +83,9 @@ class IntervalJoinOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => - new IntervalJoinOpExec( - leftAttributeName, - rightAttributeName, - includeLeftBound, - includeRightBound, - constant, - timeIntervalType - ) + OpExecWithClassName( + "edu.uci.ics.amber.operator.intervalJoin.IntervalJoinOpExec", + objectMapper.writeValueAsString(this) ) ) .withInputPorts(operatorInfo.inputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpExec.scala index 2999ebc482a..3abc4fce00b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpExec.scala @@ -4,6 +4,7 @@ import edu.uci.ics.amber.core.WorkflowRuntimeException import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.{AttributeType, Tuple, TupleLike} import edu.uci.ics.amber.operator.hashJoin.JoinUtils +import edu.uci.ics.amber.util.JSONUtils.objectMapper import java.sql.Timestamp import scala.collection.mutable.ListBuffer @@ -13,14 +14,10 @@ import scala.collection.mutable.ListBuffer * 2. 
The left input join key takes as points, join condition is: left key in the range of (right key, right key + constant) */ class IntervalJoinOpExec( - leftAttributeName: String, - rightAttributeName: String, - includeLeftBound: Boolean, - includeRightBound: Boolean, - constant: Long, - timeIntervalType: Option[TimeIntervalType] + descString: String ) extends OperatorExecutor { - + private val desc: IntervalJoinOpDesc = + objectMapper.readValue(descString, classOf[IntervalJoinOpDesc]) var leftTable: ListBuffer[Tuple] = new ListBuffer[Tuple]() var rightTable: ListBuffer[Tuple] = new ListBuffer[Tuple]() @@ -34,10 +31,10 @@ class IntervalJoinOpExec( .filter(rightTableTuple => { intervalCompare( - tuple.getField(leftAttributeName), - rightTableTuple.getField(rightAttributeName), + tuple.getField(desc.leftAttributeName), + rightTableTuple.getField(desc.rightAttributeName), rightTableTuple.getSchema - .getAttribute(rightAttributeName) + .getAttribute(desc.rightAttributeName) .getType ) == 0 }) @@ -53,10 +50,10 @@ class IntervalJoinOpExec( leftTable .filter(leftTableTuple => { intervalCompare( - leftTableTuple.getField(leftAttributeName), - tuple.getField(rightAttributeName), + leftTableTuple.getField(desc.leftAttributeName), + tuple.getField(desc.rightAttributeName), leftTableTuple.getSchema - .getAttribute(leftAttributeName) + .getAttribute(desc.leftAttributeName) .getType ) == 0 }) @@ -74,10 +71,10 @@ class IntervalJoinOpExec( while (rightTable.nonEmpty) { if ( intervalCompare( - leftTableSmallestTuple.getField(leftAttributeName), - rightTable.head.getField(rightAttributeName), + leftTableSmallestTuple.getField(desc.leftAttributeName), + rightTable.head.getField(desc.rightAttributeName), leftTableSmallestTuple.getSchema - .getAttribute(leftAttributeName) + .getAttribute(desc.leftAttributeName) .getType ) > 0 ) { @@ -94,10 +91,10 @@ class IntervalJoinOpExec( while (leftTable.nonEmpty) { if ( intervalCompare( - leftTable.head.getField(leftAttributeName), - rightTableSmallestTuple.getField(rightAttributeName), + leftTable.head.getField(desc.leftAttributeName), + rightTableSmallestTuple.getField(desc.rightAttributeName), rightTableSmallestTuple.getSchema - .getAttribute(rightAttributeName) + .getAttribute(desc.rightAttributeName) .getType ) < 0 ) { @@ -114,15 +111,15 @@ class IntervalJoinOpExec( leftBoundValue: T, rightBoundValue: T )(implicit ev$1: T => Ordered[T]): Int = { - if (includeLeftBound && includeRightBound) { + if (desc.includeLeftBound && desc.includeRightBound) { if (pointValue >= leftBoundValue && pointValue <= rightBoundValue) 0 else if (pointValue < leftBoundValue) -1 else 1 - } else if (includeLeftBound && !includeRightBound) { + } else if (desc.includeLeftBound && !desc.includeRightBound) { if (pointValue >= leftBoundValue && pointValue < rightBoundValue) 0 else if (pointValue < leftBoundValue) -1 else 1 - } else if (!includeLeftBound && includeRightBound) { + } else if (!desc.includeLeftBound && desc.includeRightBound) { if (pointValue > leftBoundValue && pointValue <= rightBoundValue) 0 else if (pointValue <= leftBoundValue) -1 else 1 @@ -142,7 +139,7 @@ class IntervalJoinOpExec( if (dataType == AttributeType.LONG) { val pointValue: Long = point.asInstanceOf[Long] val leftBoundValue: Long = leftBound.asInstanceOf[Long] - val constantValue: Long = constant + val constantValue: Long = desc.constant val rightBoundValue: Long = leftBoundValue + constantValue result = processNumValue[Long]( pointValue, @@ -153,7 +150,7 @@ class IntervalJoinOpExec( } else if (dataType == 
AttributeType.DOUBLE) { val pointValue: Double = point.asInstanceOf[Double] val leftBoundValue: Double = leftBound.asInstanceOf[Double] - val constantValue: Double = constant.asInstanceOf[Double] + val constantValue: Double = desc.constant.asInstanceOf[Double] val rightBoundValue: Double = leftBoundValue + constantValue result = processNumValue[Double]( pointValue, @@ -163,7 +160,7 @@ class IntervalJoinOpExec( } else if (dataType == AttributeType.INTEGER) { val pointValue: Int = point.asInstanceOf[Int] val leftBoundValue: Int = leftBound.asInstanceOf[Int] - val constantValue: Int = constant.asInstanceOf[Int] + val constantValue: Int = desc.constant.asInstanceOf[Int] val rightBoundValue: Int = leftBoundValue + constantValue result = processNumValue[Int]( pointValue, @@ -174,21 +171,21 @@ class IntervalJoinOpExec( val pointValue: Timestamp = point.asInstanceOf[Timestamp] val leftBoundValue: Timestamp = leftBound.asInstanceOf[Timestamp] val rightBoundValue: Timestamp = - timeIntervalType match { + desc.timeIntervalType match { case Some(TimeIntervalType.YEAR) => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusYears(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusYears(desc.constant)) case Some(TimeIntervalType.MONTH) => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusMonths(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusMonths(desc.constant)) case Some(TimeIntervalType.DAY) => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusDays(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusDays(desc.constant)) case Some(TimeIntervalType.HOUR) => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusHours(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusHours(desc.constant)) case Some(TimeIntervalType.MINUTE) => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusMinutes(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusMinutes(desc.constant)) case Some(TimeIntervalType.SECOND) => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusSeconds(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusSeconds(desc.constant)) case None => - Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusDays(constant)) + Timestamp.valueOf(leftBoundValue.toLocalDateTime.plusDays(desc.constant)) } result = processNumValue( pointValue.getTime, diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpDesc.scala index cf478610d56..b3e41f267e6 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpDesc.scala @@ -2,11 +2,12 @@ package edu.uci.ics.amber.operator.keywordSearch import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.filter.FilterOpDesc import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName +import edu.uci.ics.amber.util.JSONUtils.objectMapper import 
edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -32,7 +33,10 @@ class KeywordSearchOpDesc extends FilterOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new KeywordSearchOpExec(attribute, keyword)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.keywordSearch.KeywordSearchOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExec.scala index 4275a682521..1154ed18a28 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExec.scala @@ -2,23 +2,27 @@ package edu.uci.ics.amber.operator.keywordSearch import edu.uci.ics.amber.core.tuple.Tuple import edu.uci.ics.amber.operator.filter.FilterOpExec +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.apache.lucene.analysis.standard.StandardAnalyzer import org.apache.lucene.index.memory.MemoryIndex import org.apache.lucene.queryparser.classic.QueryParser import org.apache.lucene.search.Query -class KeywordSearchOpExec(attributeName: String, keyword: String) extends FilterOpExec { +class KeywordSearchOpExec(descString: String) extends FilterOpExec { + private val desc: KeywordSearchOpDesc = + objectMapper.readValue(descString, classOf[KeywordSearchOpDesc]) + // We chose StandardAnalyzer because it provides more comprehensive tokenization, retaining numeric tokens and handling a broader range of characters. // This ensures that search functionality can include standalone numbers (e.g., "3") and complex queries while offering robust performance for most use cases. 
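The per-tuple probe below is compact enough to sketch in isolation. A minimal standalone example against the same Lucene API; the field name "text", the query "apples AND 3", and the sample value are invented for illustration and are not values from this operator:

    import org.apache.lucene.analysis.standard.StandardAnalyzer
    import org.apache.lucene.index.memory.MemoryIndex
    import org.apache.lucene.queryparser.classic.QueryParser

    val analyzer = new StandardAnalyzer()
    // bind the query to a field name and parse the keyword once
    val query = new QueryParser("text", analyzer).parse("apples AND 3")
    val index = new MemoryIndex()
    // per-tuple probe: index the single field value, score it, reset for reuse
    index.addField("text", "she bought 3 apples", analyzer)
    val isMatch = index.search(query) > 0.0f // true: StandardAnalyzer keeps the numeric token "3"
    index.reset()
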
@transient private lazy val analyzer = new StandardAnalyzer() - @transient lazy val query: Query = new QueryParser(attributeName, analyzer).parse(keyword) + @transient lazy val query: Query = new QueryParser(desc.attribute, analyzer).parse(desc.keyword) @transient private lazy val memoryIndex: MemoryIndex = new MemoryIndex() this.setFilterFunc(findKeyword) private def findKeyword(tuple: Tuple): Boolean = { - Option[Any](tuple.getField(attributeName)).map(_.toString).exists { fieldValue => - memoryIndex.addField(attributeName, fieldValue, analyzer) + Option[Any](tuple.getField(desc.attribute)).map(_.toString).exists { fieldValue => + memoryIndex.addField(desc.attribute, fieldValue, analyzer) val isMatch = memoryIndex.search(query) > 0.0f memoryIndex.reset() isMatch diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala index b3cf15a0e40..70ebe4725f4 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala @@ -2,11 +2,12 @@ package edu.uci.ics.amber.operator.limit import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.{LogicalOp, StateTransferFunc} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -28,9 +29,10 @@ class LimitOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => { - new LimitOpExec(limit) - }) + OpExecWithClassName( + "edu.uci.ics.amber.operator.limit.LimitOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpExec.scala index b3d74a81f56..f396f0acbee 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpExec.scala @@ -2,13 +2,15 @@ package edu.uci.ics.amber.operator.limit import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} +import edu.uci.ics.amber.util.JSONUtils.objectMapper -class LimitOpExec(limit: Int) extends OperatorExecutor { +class LimitOpExec(descString: String) extends OperatorExecutor { + private val desc: LimitOpDesc = objectMapper.readValue(descString, classOf[LimitOpDesc]) var count = 0 override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = { - if (count < limit) { + if (count < desc.limit) { count += 1 Iterator(tuple) } else { diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala index 2bd6fc413fd..47b80cfaef0 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala @@ -3,12 +3,13 @@ package edu.uci.ics.amber.operator.projection import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.google.common.base.Preconditions import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, Schema} import edu.uci.ics.amber.core.workflow.PhysicalOp.oneToOnePhysicalOp import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.map.MapOpDesc import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -29,7 +30,10 @@ class ProjectionOpDesc extends MapOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new ProjectionOpExec(attributes, isDrop)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.projection.ProjectionOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExec.scala index 75458594c8d..888c4b4c976 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExec.scala @@ -3,23 +3,24 @@ package edu.uci.ics.amber.operator.projection import com.google.common.base.Preconditions import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} import edu.uci.ics.amber.operator.map.MapOpExec +import edu.uci.ics.amber.util.JSONUtils.objectMapper import scala.collection.mutable class ProjectionOpExec( - attributeUnits: List[AttributeUnit], - dropOption: Boolean = false + descString: String ) extends MapOpExec { + val desc: ProjectionOpDesc = objectMapper.readValue(descString, classOf[ProjectionOpDesc]) setMapFunc(project) def project(tuple: Tuple): TupleLike = { - Preconditions.checkArgument(attributeUnits.nonEmpty) + Preconditions.checkArgument(desc.attributes.nonEmpty) var selectedUnits: List[AttributeUnit] = List() val fields = mutable.LinkedHashMap[String, Any]() - if (dropOption) { + if (desc.isDrop) { val allAttribute = tuple.schema.getAttributeNames - val selectedAttributes = attributeUnits.map(_.getOriginalAttribute) + val selectedAttributes = desc.attributes.map(_.getOriginalAttribute) val keepAttributes = allAttribute.diff(selectedAttributes) keepAttributes.foreach { attribute => @@ -31,7 +32,7 @@ class ProjectionOpExec( } else { - selectedUnits = attributeUnits + selectedUnits = desc.attributes } selectedUnits.foreach { attributeUnit => diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpDesc.scala index 7fa187e70ee..3fd4849c27c 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpDesc.scala @@ -1,14 +1,12 @@ package edu.uci.ics.amber.operator.randomksampling import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import edu.uci.ics.amber.core.executor.OpExecInitInfo -import edu.uci.ics.amber.core.workflow.PhysicalOp +import edu.uci.ics.amber.core.executor.OpExecWithClassName +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp} import edu.uci.ics.amber.operator.filter.FilterOpDesc import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} - -import scala.util.Random +import edu.uci.ics.amber.util.JSONUtils.objectMapper class RandomKSamplingOpDesc extends FilterOpDesc { @@ -25,8 +23,9 @@ class RandomKSamplingOpDesc extends FilterOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((idx, workerCount) => - new RandomKSamplingOpExec(percentage, idx, Array.fill(workerCount)(Random.nextInt())) + OpExecWithClassName( + "edu.uci.ics.amber.operator.randomksampling.RandomKSamplingOpExec", + objectMapper.writeValueAsString(this) ) ) .withInputPorts(operatorInfo.inputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpExec.scala index f6028b6c5c5..74767f46d00 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/randomksampling/RandomKSamplingOpExec.scala @@ -1,11 +1,14 @@ package edu.uci.ics.amber.operator.randomksampling import edu.uci.ics.amber.operator.filter.FilterOpExec +import edu.uci.ics.amber.util.JSONUtils.objectMapper import scala.util.Random -class RandomKSamplingOpExec(percentage: Int, worker: Int, seedFunc: Int => Int) - extends FilterOpExec { - val rand: Random = new Random(seedFunc(worker)) - setFilterFunc(_ => (percentage / 100.0) >= rand.nextDouble()) +class RandomKSamplingOpExec(descString: String, idx: Int, workerCount: Int) extends FilterOpExec { + private val desc: RandomKSamplingOpDesc = + objectMapper.readValue(descString, classOf[RandomKSamplingOpDesc]) + + val rand: Random = new Random(workerCount) + setFilterFunc(_ => (desc.percentage / 100.0) >= rand.nextDouble()) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpDesc.scala index 6d06c839943..070417540fb 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpDesc.scala @@ -2,11 +2,12 @@ package edu.uci.ics.amber.operator.regex import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import 
com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.filter.FilterOpDesc import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -35,7 +36,10 @@ class RegexOpDesc extends FilterOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new RegexOpExec(regex, caseInsensitive, attribute)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.regex.RegexOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpExec.scala index 49a5aafbcfa..0c2b72a402b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/regex/RegexOpExec.scala @@ -2,17 +2,18 @@ package edu.uci.ics.amber.operator.regex import edu.uci.ics.amber.core.tuple.Tuple import edu.uci.ics.amber.operator.filter.FilterOpExec +import edu.uci.ics.amber.util.JSONUtils.objectMapper import java.util.regex.Pattern -class RegexOpExec(regex: String, caseInsensitive: Boolean, attributeName: String) - extends FilterOpExec { +class RegexOpExec(descString: String) extends FilterOpExec { + private val desc: RegexOpDesc = objectMapper.readValue(descString, classOf[RegexOpDesc]) lazy val pattern: Pattern = - Pattern.compile(regex, if (caseInsensitive) Pattern.CASE_INSENSITIVE else 0) + Pattern.compile(desc.regex, if (desc.caseInsensitive) Pattern.CASE_INSENSITIVE else 0) this.setFilterFunc(this.matchRegex) private def matchRegex(tuple: Tuple): Boolean = - Option[Any](tuple.getField(attributeName).toString) + Option[Any](tuple.getField(desc.attribute).toString) .map(_.toString) .exists(value => pattern.matcher(value).find) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala index 79ab7eadf1e..cc1840609bf 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala @@ -2,16 +2,14 @@ package edu.uci.ics.amber.operator.reservoirsampling import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.google.common.base.Preconditions -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import 
edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} -import edu.uci.ics.amber.operator.util.OperatorDescriptorUtils.equallyPartitionGoal - -import scala.util.Random +import edu.uci.ics.amber.util.JSONUtils.objectMapper class ReservoirSamplingOpDesc extends LogicalOp { @@ -28,12 +26,9 @@ class ReservoirSamplingOpDesc extends LogicalOp { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((idx, workerCount) => - new ReservoirSamplingOpExec( - idx, - equallyPartitionGoal(k, workerCount), - Array.fill(workerCount)(Random.nextInt()) - ) + OpExecWithClassName( + "edu.uci.ics.amber.operator.reservoirsampling.ReservoirSamplingOpExec", + objectMapper.writeValueAsString(this) ) ) .withInputPorts(operatorInfo.inputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpExec.scala index 9e7f7c8cc14..7382e410dc9 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpExec.scala @@ -2,22 +2,27 @@ package edu.uci.ics.amber.operator.reservoirsampling import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} +import edu.uci.ics.amber.operator.util.OperatorDescriptorUtils.equallyPartitionGoal +import edu.uci.ics.amber.util.JSONUtils.objectMapper import scala.util.Random -class ReservoirSamplingOpExec(actor: Int, kPerActor: Int => Int, seedFunc: Int => Int) +class ReservoirSamplingOpExec(descString: String, idx: Int, workerCount: Int) extends OperatorExecutor { + private val desc: ReservoirSamplingOpDesc = + objectMapper.readValue(descString, classOf[ReservoirSamplingOpDesc]) + private val count: Int = equallyPartitionGoal(desc.k, workerCount)(idx) private var n: Int = 0 - private val reservoir: Array[Tuple] = Array.ofDim(kPerActor(actor)) - private val rand: Random = new Random(seedFunc(actor)) + private val reservoir: Array[Tuple] = Array.ofDim(count) + private val rand: Random = new Random(workerCount) override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = { - if (n < kPerActor(actor)) { + if (n < count) { reservoir(n) = tuple } else { val i = rand.nextInt(n) - if (i < kPerActor(actor)) { + if (i < count) { reservoir(i) = tuple } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala index 247e7893f23..815b08bdb2a 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala @@ -2,12 +2,13 @@ package edu.uci.ics.amber.operator.sentiment import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaInject -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.map.MapOpDesc import 
edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @@ -46,7 +47,10 @@ class SentimentAnalysisOpDesc extends MapOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => new SentimentAnalysisOpExec(attribute)) + OpExecWithClassName( + "edu.uci.ics.amber.operator.sentiment.SentimentAnalysisOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpExec.java b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpExec.java index 038e12e461f..df907bae693 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpExec.java +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpExec.java @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.sentiment; +import com.fasterxml.jackson.core.JsonProcessingException; import edu.stanford.nlp.ling.CoreAnnotations; import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations; import edu.stanford.nlp.pipeline.Annotation; @@ -10,6 +11,7 @@ import edu.uci.ics.amber.core.tuple.Tuple; import edu.uci.ics.amber.core.tuple.TupleLike; import edu.uci.ics.amber.operator.map.MapOpExec; +import edu.uci.ics.amber.util.JSONUtils; import scala.Function1; import java.io.Serializable; @@ -19,8 +21,9 @@ public class SentimentAnalysisOpExec extends MapOpExec { private final String attributeName; private final StanfordCoreNLPWrapper coreNlp; - public SentimentAnalysisOpExec(String attributeName) { - this.attributeName = attributeName; + public SentimentAnalysisOpExec(String descString) throws JsonProcessingException { + SentimentAnalysisOpDesc desc = JSONUtils.objectMapper().readValue(descString, SentimentAnalysisOpDesc.class); + this.attributeName = desc.attribute(); Properties props = new Properties(); props.setProperty("annotators", "tokenize, ssplit, parse, sentiment"); coreNlp = new StanfordCoreNLPWrapper(props); diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/managed/ProgressiveSinkOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/ProgressiveSinkOpExec.scala similarity index 89% rename from core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/managed/ProgressiveSinkOpExec.scala rename to core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/ProgressiveSinkOpExec.scala index bd0bd187c23..9c9b2e7fad1 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/managed/ProgressiveSinkOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/ProgressiveSinkOpExec.scala @@ -1,10 +1,9 @@ -package edu.uci.ics.amber.operator.sink.managed +package edu.uci.ics.amber.operator.sink import edu.uci.ics.amber.core.executor.SinkOperatorExecutor import edu.uci.ics.amber.core.storage.model.BufferedItemWriter import edu.uci.ics.amber.core.storage.result.ResultStorage import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike} -import edu.uci.ics.amber.operator.sink.ProgressiveUtils import 
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/managed/ProgressiveSinkOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/ProgressiveSinkOpExec.scala
similarity index 89%
rename from core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/managed/ProgressiveSinkOpExec.scala
rename to core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/ProgressiveSinkOpExec.scala
index bd0bd187c23..9c9b2e7fad1 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/managed/ProgressiveSinkOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sink/ProgressiveSinkOpExec.scala
@@ -1,10 +1,9 @@
-package edu.uci.ics.amber.operator.sink.managed
+package edu.uci.ics.amber.operator.sink

 import edu.uci.ics.amber.core.executor.SinkOperatorExecutor
 import edu.uci.ics.amber.core.storage.model.BufferedItemWriter
 import edu.uci.ics.amber.core.storage.result.ResultStorage
 import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike}
-import edu.uci.ics.amber.operator.sink.ProgressiveUtils
 import edu.uci.ics.amber.core.virtualidentity.WorkflowIdentity
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.PortIdentity
@@ -15,7 +14,10 @@ class ProgressiveSinkOpExec(
     workflowIdentity: WorkflowIdentity
 ) extends SinkOperatorExecutor {
   val writer: BufferedItemWriter[Tuple] =
-    ResultStorage.getOpResultStorage(workflowIdentity).get(storageKey).writer()
+    ResultStorage
+      .getOpResultStorage(workflowIdentity)
+      .get(storageKey)
+      .writer()

   override def open(): Unit = {
     writer.open()
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala
index 73366908cb1..6c06d95dadc 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala
@@ -3,12 +3,13 @@ package edu.uci.ics.amber.operator.sortPartitions

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.Schema
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, RangePartition}
 import edu.uci.ics.amber.operator.LogicalOp
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort}
@@ -48,15 +49,9 @@ class SortPartitionsOpDesc extends LogicalOp {
       workflowId,
       executionId,
       operatorIdentifier,
-      OpExecInitInfo(opExecFunc =
-        (idx, workerCount) =>
-          new SortPartitionOpExec(
-            sortAttributeName,
-            idx,
-            domainMin,
-            domainMax,
-            workerCount
-          )
+      OpExecWithClassName(
+        "edu.uci.ics.amber.operator.sortPartitions.SortPartitionsOpExec",
+        objectMapper.writeValueAsString(this)
       )
     )
     .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExec.scala
similarity index 77%
rename from core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionOpExec.scala
rename to core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExec.scala
index fd52c34d0d0..df773dac8e4 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExec.scala
@@ -2,17 +2,15 @@ package edu.uci.ics.amber.operator.sortPartitions

 import edu.uci.ics.amber.core.executor.OperatorExecutor
 import edu.uci.ics.amber.core.tuple.{AttributeType, Tuple, TupleLike}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper

 import scala.collection.mutable.ArrayBuffer

-class SortPartitionOpExec(
-    sortAttributeName: String,
-    localIdx: Int,
-    domainMin: Long,
-    domainMax: Long,
-    numberOfWorkers: Int
+class SortPartitionsOpExec( + descString: String ) extends OperatorExecutor { - + private val desc: SortPartitionsOpDesc = + objectMapper.readValue(descString, classOf[SortPartitionsOpDesc]) private var unorderedTuples: ArrayBuffer[Tuple] = _ private def sortTuples(): Iterator[TupleLike] = unorderedTuples.sortWith(compareTuples).iterator @@ -25,8 +23,8 @@ class SortPartitionOpExec( override def onFinish(port: Int): Iterator[TupleLike] = sortTuples() private def compareTuples(t1: Tuple, t2: Tuple): Boolean = { - val attributeType = t1.getSchema.getAttribute(sortAttributeName).getType - val attributeIndex = t1.getSchema.getIndex(sortAttributeName) + val attributeType = t1.getSchema.getAttribute(desc.sortAttributeName).getType + val attributeIndex = t1.getSchema.getIndex(desc.sortAttributeName) attributeType match { case AttributeType.LONG => t1.getField[Long](attributeIndex) < t2.getField[Long](attributeIndex) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala index 3d2d18eac50..ad36af84dbb 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala @@ -6,11 +6,10 @@ import edu.uci.ics.amber.operator.LogicalOp abstract class SourceOperatorDescriptor extends LogicalOp { + def sourceSchema(): Schema + override def getOutputSchema(schemas: Array[Schema]): Schema = { Preconditions.checkArgument(schemas.isEmpty) sourceSchema() } - - def sourceSchema(): Schema - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/TwitterSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/TwitterSourceOpExec.scala index 8d9013d7f66..67eed71cd21 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/TwitterSourceOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/TwitterSourceOpExec.scala @@ -1,14 +1,15 @@ package edu.uci.ics.amber.operator.source.apis.twitter import edu.uci.ics.amber.core.executor.SourceOperatorExecutor +import edu.uci.ics.amber.util.JSONUtils.objectMapper import io.github.redouane59.twitter.TwitterClient import io.github.redouane59.twitter.signature.TwitterCredentials abstract class TwitterSourceOpExec( - apiKey: String, - apiSecretKey: String, - stopWhenRateLimited: Boolean + descString: String ) extends SourceOperatorExecutor { + private val desc: TwitterSourceOpDesc = + objectMapper.readValue(descString, classOf[TwitterSourceOpDesc]) // batch size for each API request defined by Twitter // 500 is the maximum tweets for each request val TWITTER_API_BATCH_SIZE_MAX = 500 @@ -28,11 +29,11 @@ abstract class TwitterSourceOpExec( twitterClient = new TwitterClient( TwitterCredentials .builder() - .apiKey(apiKey) - .apiSecretKey(apiSecretKey) + .apiKey(desc.apiKey) + .apiSecretKey(desc.apiSecretKey) .build() ) - twitterClient.setAutomaticRetry(!stopWhenRateLimited) + twitterClient.setAutomaticRetry(!desc.stopWhenRateLimited) } override def close(): Unit = {} diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala index 5b17a08d23a..c3a92cbcadd 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala @@ -6,11 +6,12 @@ import com.kjetland.jackson.jsonSchema.annotations.{ JsonSchemaInject, JsonSchemaTitle } -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.annotations.UIWidget import edu.uci.ics.amber.operator.source.apis.twitter.TwitterSourceOpDesc +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} class TwitterFullArchiveSearchSourceOpDesc extends TwitterSourceOpDesc { @@ -49,17 +50,9 @@ class TwitterFullArchiveSearchSourceOpDesc extends TwitterSourceOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => - new TwitterFullArchiveSearchSourceOpExec( - apiKey, - apiSecretKey, - stopWhenRateLimited, - searchQuery, - limit, - fromDateTime, - toDateTime, - () => sourceSchema() - ) + OpExecWithClassName( + "edu.uci.ics.amber.operator.source.apis.twitter.v2.TwitterFullArchiveSearchSourceOpExec", + objectMapper.writeValueAsString(this) ) ) .withInputPorts(operatorInfo.inputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpExec.scala index 023a3f5337f..af9fd0f2e3b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpExec.scala @@ -3,6 +3,7 @@ package edu.uci.ics.amber.operator.source.apis.twitter.v2 import edu.uci.ics.amber.core.tuple.{Schema, Tuple, TupleLike} import edu.uci.ics.amber.operator.source.apis.twitter.TwitterSourceOpExec import edu.uci.ics.amber.operator.source.apis.twitter.v2.TwitterUtils.tweetDataToTuple +import edu.uci.ics.amber.util.JSONUtils.objectMapper import io.github.redouane59.twitter.dto.endpoints.AdditionalParameters import io.github.redouane59.twitter.dto.tweet.TweetList import io.github.redouane59.twitter.dto.tweet.TweetV2.TweetData @@ -15,18 +16,11 @@ import scala.collection.{Iterator, mutable} import scala.jdk.CollectionConverters.ListHasAsScala class TwitterFullArchiveSearchSourceOpExec( - apiKey: String, - apiSecretKey: String, - stopWhenRateLimited: Boolean, - searchQuery: String, - limit: Int, - fromDateTime: String, - toDateTime: String, - schemaFunc: () => Schema -) extends TwitterSourceOpExec(apiKey, apiSecretKey, stopWhenRateLimited) { - val outputSchema: Schema = schemaFunc() - - var curLimit: Int = limit + descString: String +) extends TwitterSourceOpExec(descString) { + private val desc: TwitterFullArchiveSearchSourceOpDesc = + objectMapper.readValue(descString, classOf[TwitterFullArchiveSearchSourceOpDesc]) + var curLimit: Int 
= desc.limit // nextToken is used to retrieve next page of results, if exists. var nextToken: String = _ // contains tweets from the previous request. @@ -34,6 +28,7 @@ class TwitterFullArchiveSearchSourceOpExec( var userCache: Map[String, UserData] = Map() var hasNextRequest: Boolean = curLimit > 0 var lastQueryTime: Long = 0 + val schema: Schema = desc.sourceSchema() override def produceTuple(): Iterator[TupleLike] = new Iterator[TupleLike]() { @@ -43,9 +38,9 @@ class TwitterFullArchiveSearchSourceOpExec( // if the current cache is exhausted, query for the next response if (tweetCache.isEmpty && hasNextRequest) { queryForNextBatch( - searchQuery, - LocalDateTime.parse(fromDateTime, DateTimeFormatter.ISO_DATE_TIME), - LocalDateTime.parse(toDateTime, DateTimeFormatter.ISO_DATE_TIME), + desc.searchQuery, + LocalDateTime.parse(desc.fromDateTime, DateTimeFormatter.ISO_DATE_TIME), + LocalDateTime.parse(desc.toDateTime, DateTimeFormatter.ISO_DATE_TIME), curLimit.min(TWITTER_API_BATCH_SIZE_MAX) ) } @@ -65,7 +60,7 @@ class TwitterFullArchiveSearchSourceOpExec( val user = userCache.get(tweet.getAuthorId) - tweetDataToTuple(tweet, user, outputSchema) + tweetDataToTuple(tweet, user, schema) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala index 39d5bd697bb..15b0ddfaf21 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala @@ -6,11 +6,12 @@ import com.kjetland.jackson.jsonSchema.annotations.{ JsonSchemaInject, JsonSchemaTitle } -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.annotations.UIWidget import edu.uci.ics.amber.operator.source.apis.twitter.TwitterSourceOpDesc +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} class TwitterSearchSourceOpDesc extends TwitterSourceOpDesc { @@ -39,15 +40,9 @@ class TwitterSearchSourceOpDesc extends TwitterSourceOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => - new TwitterSearchSourceOpExec( - apiKey, - apiSecretKey, - stopWhenRateLimited, - searchQuery, - limit, - () => sourceSchema() - ) + OpExecWithClassName( + "edu.uci.ics.amber.operator.source.apis.twitter.v2.TwitterSearchSourceOpExec", + objectMapper.writeValueAsString(this) ) ) .withInputPorts(operatorInfo.inputPorts) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpExec.scala index 27522c99103..198c22e184b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpExec.scala @@ -3,6 +3,7 @@ package edu.uci.ics.amber.operator.source.apis.twitter.v2 
import edu.uci.ics.amber.core.tuple.{Schema, Tuple, TupleLike} import edu.uci.ics.amber.operator.source.apis.twitter.TwitterSourceOpExec import edu.uci.ics.amber.operator.source.apis.twitter.v2.TwitterUtils.tweetDataToTuple +import edu.uci.ics.amber.util.JSONUtils.objectMapper import io.github.redouane59.twitter.dto.endpoints.AdditionalParameters import io.github.redouane59.twitter.dto.tweet.TweetList import io.github.redouane59.twitter.dto.tweet.TweetV2.TweetData @@ -13,15 +14,11 @@ import scala.collection.{Iterator, mutable} import scala.jdk.CollectionConverters.ListHasAsScala class TwitterSearchSourceOpExec( - apiKey: String, - apiSecretKey: String, - stopWhenRateLimited: Boolean, - searchQuery: String, - limit: Int, - schemaFunc: () => Schema -) extends TwitterSourceOpExec(apiKey, apiSecretKey, stopWhenRateLimited) { - val outputSchema: Schema = schemaFunc() - var curLimit: Int = limit + descString: String +) extends TwitterSourceOpExec(descString) { + private val desc: TwitterSearchSourceOpDesc = + objectMapper.readValue(descString, classOf[TwitterSearchSourceOpDesc]) + var curLimit: Int = desc.limit // nextToken is used to retrieve next page of results, if exists. var nextToken: String = _ // contains tweets from the previous request. @@ -29,6 +26,7 @@ class TwitterSearchSourceOpExec( var userCache: Map[String, UserData] = Map() var hasNextRequest: Boolean = curLimit > 0 var lastQueryTime: Long = 0 + val schema: Schema = desc.sourceSchema() override def produceTuple(): Iterator[TupleLike] = new Iterator[TupleLike]() { @@ -38,7 +36,7 @@ class TwitterSearchSourceOpExec( // if the current cache is exhausted, query for the next response if (tweetCache.isEmpty && hasNextRequest) { queryForNextBatch( - searchQuery, + desc.searchQuery, curLimit.min(TWITTER_API_BATCH_SIZE_MAX) ) } @@ -58,7 +56,7 @@ class TwitterSearchSourceOpExec( val user = userCache.get(tweet.getAuthorId) - tweetDataToTuple(tweet, user, outputSchema) + tweetDataToTuple(tweet, user, schema) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala index 38ce1e997e0..49f5028d718 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala @@ -2,11 +2,12 @@ package edu.uci.ics.amber.operator.source.fetcher import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.source.SourceOperatorDescriptor +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.OutputPort @@ -26,7 +27,7 @@ class URLFetcherOpDesc extends SourceOperatorDescriptor { ) var decodingMethod: DecodingMethod = _ - def sourceSchema(): Schema = { + override def sourceSchema(): Schema = { Schema .builder() .add( @@ -49,7 +50,10 @@ class URLFetcherOpDesc extends 
SourceOperatorDescriptor {
       workflowId,
       executionId,
       operatorIdentifier,
-      OpExecInitInfo((_, _) => new URLFetcherOpExec(url, decodingMethod))
+      OpExecWithClassName(
+        "edu.uci.ics.amber.operator.source.fetcher.URLFetcherOpExec",
+        objectMapper.writeValueAsString(this)
+      )
     )
     .withInputPorts(operatorInfo.inputPorts)
     .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExec.scala
index 8c61c9cabad..5c519f45aec 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExec.scala
@@ -3,21 +3,22 @@
 import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.tuple.TupleLike
 import edu.uci.ics.amber.operator.source.fetcher.URLFetchUtil.getInputStreamFromURL
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.apache.commons.io.IOUtils

 import java.net.URL

-class URLFetcherOpExec(url: String, decodingMethod: DecodingMethod) extends SourceOperatorExecutor {
-
+class URLFetcherOpExec(descString: String) extends SourceOperatorExecutor {
+  private val desc: URLFetcherOpDesc = objectMapper.readValue(descString, classOf[URLFetcherOpDesc])
   override def produceTuple(): Iterator[TupleLike] = {
-    val urlObj = new URL(url)
+    val urlObj = new URL(desc.url)
     val input = getInputStreamFromURL(urlObj)
     val contentInputStream = input match {
       case Some(value) => value
-      case None        => IOUtils.toInputStream(s"Fetch failed for URL: $url", "UTF-8")
+      case None        => IOUtils.toInputStream(s"Fetch failed for URL: ${desc.url}", "UTF-8")
     }
-    Iterator(if (decodingMethod == DecodingMethod.UTF_8) {
+    Iterator(if (desc.decodingMethod == DecodingMethod.UTF_8) {
       TupleLike(IOUtils.toString(contentInputStream, "UTF-8"))
     } else {
       TupleLike(IOUtils.toByteArray(contentInputStream))
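The braces in ${desc.url} above are load-bearing: in Scala string interpolation, $desc.url expands only the identifier desc and then appends the literal text ".url", so the message would print the descriptor's toString instead of the URL. A standalone illustration (Desc is a stand-in class, not from this patch):

    case class Desc(url: String) // stand-in for the deserialized descriptor
    val desc = Desc("https://example.com")
    s"Fetch failed for URL: $desc.url"   // Fetch failed for URL: Desc(https://example.com).url
    s"Fetch failed for URL: ${desc.url}" // Fetch failed for URL: https://example.com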
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala
index 5902e0e030c..90c65c87eb9 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala
@@ -6,12 +6,13 @@ import com.kjetland.jackson.jsonSchema.annotations.{
   JsonSchemaString,
   JsonSchemaTitle
 }
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.annotations.HideAnnotation
 import edu.uci.ics.amber.operator.source.scan.text.TextSourceOpDesc
-import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper

 @JsonIgnoreProperties(value = Array("limit", "offset", "fileEncoding"))
 class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc {
@@ -52,26 +53,19 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc {
       workflowId,
       executionId,
       operatorIdentifier,
-      OpExecInitInfo((_, _) =>
-        new FileScanSourceOpExec(
-          fileUri.get,
-          attributeType,
-          encoding,
-          extract,
-          outputFileName,
-          fileScanLimit,
-          fileScanOffset
-        )
+      OpExecWithClassName(
+        "edu.uci.ics.amber.operator.source.scan.FileScanSourceOpExec",
+        objectMapper.writeValueAsString(this)
       )
     )
     .withInputPorts(operatorInfo.inputPorts)
     .withOutputPorts(operatorInfo.outputPorts)
     .withPropagateSchema(
-      SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> inferSchema()))
+      SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
     )
   }

-  override def inferSchema(): Schema = {
+  override def sourceSchema(): Schema = {
     val builder = Schema.builder()
     if (outputFileName) builder.add(new Attribute("filename", AttributeType.STRING))
     builder.add(new Attribute(attributeName, attributeType.getType)).build()
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpExec.scala
index 9bd68c67f16..1d786d31d3a 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpExec.scala
@@ -4,6 +4,7 @@ import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.storage.DocumentFactory
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.parseField
 import edu.uci.ics.amber.core.tuple.TupleLike
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.apache.commons.compress.archivers.{ArchiveInputStream, ArchiveStreamFactory}
 import org.apache.commons.io.IOUtils.toByteArray

@@ -13,21 +14,17 @@ import scala.collection.mutable
 import scala.jdk.CollectionConverters.IteratorHasAsScala

 class FileScanSourceOpExec private[scan] (
-    fileUri: String,
-    fileAttributeType: FileAttributeType,
-    fileEncoding: FileDecodingMethod,
-    extract: Boolean,
-    outputFileName: Boolean,
-    fileScanLimit: Option[Int] = None,
-    fileScanOffset: Option[Int] = None
+    descString: String
 ) extends SourceOperatorExecutor {
+  private val desc: FileScanSourceOpDesc =
+    objectMapper.readValue(descString, classOf[FileScanSourceOpDesc])

   @throws[IOException]
   override def produceTuple(): Iterator[TupleLike] = {
     var filenameIt: Iterator[String] = Iterator.empty
     val fileEntries: Iterator[InputStream] = {
-      val is = DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream()
-      if (extract) {
+      val is = DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asInputStream()
+      if (desc.extract) {
         val inputStream: ArchiveInputStream = new ArchiveStreamFactory().createArchiveInputStream(
           new BufferedInputStream(is)
         )
@@ -43,34 +40,34 @@ class FileScanSourceOpExec private[scan] (
       }
     }

-    if (fileAttributeType.isSingle) {
+    if (desc.attributeType.isSingle) {
       fileEntries.zipAll(filenameIt, null, null).map {
         case (entry, fileName) =>
           val fields: mutable.ListBuffer[Any] = mutable.ListBuffer()
-          if (outputFileName) {
+          if (desc.outputFileName) {
             fields.addOne(fileName)
           }
-          fields.addOne(fileAttributeType match {
+          fields.addOne(desc.attributeType match {
             case FileAttributeType.SINGLE_STRING =>
-              new String(toByteArray(entry), fileEncoding.getCharset)
-            case _ => parseField(toByteArray(entry), fileAttributeType.getType)
+              new String(toByteArray(entry), desc.fileEncoding.getCharset)
+            case _ =>
parseField(toByteArray(entry), desc.attributeType.getType) }) TupleLike(fields.toSeq: _*) } } else { fileEntries.flatMap(entry => - new BufferedReader(new InputStreamReader(entry, fileEncoding.getCharset)) + new BufferedReader(new InputStreamReader(entry, desc.fileEncoding.getCharset)) .lines() .iterator() .asScala .slice( - fileScanOffset.getOrElse(0), - fileScanOffset.getOrElse(0) + fileScanLimit.getOrElse(Int.MaxValue) + desc.fileScanOffset.getOrElse(0), + desc.fileScanOffset.getOrElse(0) + desc.fileScanLimit.getOrElse(Int.MaxValue) ) .map(line => { - TupleLike(fileAttributeType match { + TupleLike(desc.attributeType match { case FileAttributeType.SINGLE_STRING => line - case _ => parseField(line, fileAttributeType.getType) + case _ => parseField(line, desc.attributeType.getType) }) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/ScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/ScanSourceOpDesc.scala index 919c35f6cb4..db2fefdac60 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/ScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/ScanSourceOpDesc.scala @@ -3,6 +3,7 @@ package edu.uci.ics.amber.operator.source.scan import com.fasterxml.jackson.annotation.{JsonIgnore, JsonProperty, JsonPropertyDescription} import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle +import edu.uci.ics.amber.core.storage.FileResolver import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.source.SourceOperatorDescriptor @@ -29,10 +30,6 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { @JsonPropertyDescription("decoding charset to use on input") var fileEncoding: FileDecodingMethod = FileDecodingMethod.UTF_8 - // uri of the file - @JsonIgnore - var fileUri: Option[String] = None - @JsonIgnore var fileTypeName: Option[String] = None @@ -48,10 +45,7 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { @JsonDeserialize(contentAs = classOf[Int]) var offset: Option[Int] = None - override def sourceSchema(): Schema = { - if (fileUri.isEmpty) return null - inferSchema() - } + override def sourceSchema(): Schema = null override def operatorInfo: OperatorInfo = { OperatorInfo( @@ -63,12 +57,12 @@ abstract class ScanSourceOpDesc extends SourceOperatorDescriptor { ) } - def inferSchema(): Schema - - def setFileUri(uri: URI): Unit = { - fileUri = Some(uri.toASCIIString) + def setResolvedFileName(uri: URI): Unit = { + fileName = Some(uri.toASCIIString) } override def equals(that: Any): Boolean = EqualsBuilder.reflectionEquals(this, that, "context", "fileHandle") + + def fileResolved(): Boolean = fileName.isDefined && FileResolver.isFileResolved(fileName.get) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpDesc.scala index 96f48fdb75d..135ae2b6657 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpDesc.scala @@ -1,13 +1,14 @@ package edu.uci.ics.amber.operator.source.scan.arrow import 
com.fasterxml.jackson.annotation.JsonIgnoreProperties -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.storage.DocumentFactory import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.util.ArrowUtils +import edu.uci.ics.amber.util.JSONUtils.objectMapper import java.io.IOException import java.net.URI @@ -16,6 +17,7 @@ import java.nio.file.StandardOpenOption import org.apache.arrow.memory.RootAllocator import org.apache.arrow.vector.ipc.ArrowFileReader import org.apache.arrow.vector.types.pojo.{Schema => ArrowSchema} + import scala.util.Using @JsonIgnoreProperties(value = Array("fileEncoding")) @@ -33,7 +35,10 @@ class ArrowSourceOpDesc extends ScanSourceOpDesc { workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => createArrowSourceOpExec()) + OpExecWithClassName( + "edu.uci.ics.amber.operator.source.scan.arrow.ArrowSourceOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) @@ -42,15 +47,6 @@ class ArrowSourceOpDesc extends ScanSourceOpDesc { ) } - private def createArrowSourceOpExec() = { - new ArrowSourceOpExec( - fileUri.get, - limit, - offset, - schemaFunc = () => sourceSchema() - ) - } - /** * Infer Texera.Schema based on the top few lines of data. * @@ -58,7 +54,7 @@ class ArrowSourceOpDesc extends ScanSourceOpDesc { */ @Override def inferSchema(): Schema = { - val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(fileName.get)).asFile() val allocator = new RootAllocator() Using diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpExec.scala index d359b847f8a..548d0734bb8 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/arrow/ArrowSourceOpExec.scala @@ -2,37 +2,33 @@ package edu.uci.ics.amber.operator.source.scan.arrow import edu.uci.ics.amber.core.executor.SourceOperatorExecutor import edu.uci.ics.amber.core.storage.DocumentFactory +import edu.uci.ics.amber.core.tuple.TupleLike +import edu.uci.ics.amber.util.ArrowUtils +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.apache.arrow.memory.RootAllocator import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowFileReader -import edu.uci.ics.amber.core.tuple.{Schema, TupleLike} -import edu.uci.ics.amber.util.ArrowUtils import java.net.URI -import java.nio.file.{Files} -import java.nio.file.StandardOpenOption +import java.nio.file.{Files, StandardOpenOption} class ArrowSourceOpExec( - fileUri: String, - limit: Option[Int], - offset: Option[Int], - schemaFunc: () => Schema + descString: String ) extends SourceOperatorExecutor { - + private val desc: ArrowSourceOpDesc = + objectMapper.readValue(descString, classOf[ArrowSourceOpDesc]) private var reader: Option[ArrowFileReader] = None private var root: Option[VectorSchemaRoot] = None - private var schema: Option[Schema] = None 
private var allocator: Option[RootAllocator] = None override def open(): Unit = { try { - val file = DocumentFactory.newReadonlyDocument(new URI(fileUri)).asFile() + val file = DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asFile() val alloc = new RootAllocator() allocator = Some(alloc) val channel = Files.newByteChannel(file.toPath, StandardOpenOption.READ) val arrowReader = new ArrowFileReader(channel, alloc) val vectorRoot = arrowReader.getVectorSchemaRoot - schema = Some(schemaFunc()) reader = Some(arrowReader) root = Some(vectorRoot) } catch { @@ -73,8 +69,8 @@ class ArrowSourceOpExec( } } - var tupleIterator = rowIterator.drop(offset.getOrElse(0)) - if (limit.isDefined) tupleIterator = tupleIterator.take(limit.get) + var tupleIterator = rowIterator.drop(desc.offset.getOrElse(0)) + if (desc.limit.isDefined) tupleIterator = tupleIterator.take(desc.limit.get) tupleIterator } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala index 6d9fc7a5d22..cd2fdda4bdf 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala @@ -1,15 +1,15 @@ package edu.uci.ics.amber.operator.source.scan.csv -import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import com.fasterxml.jackson.databind.annotation.JsonDeserialize +import com.fasterxml.jackson.annotation.{JsonInclude, JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings} -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.storage.DocumentFactory import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc +import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import java.io.{IOException, InputStreamReader} @@ -20,7 +20,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { @JsonProperty(defaultValue = ",") @JsonSchemaTitle("Delimiter") @JsonPropertyDescription("delimiter to separate each line into fields") - @JsonDeserialize(contentAs = classOf[java.lang.String]) + @JsonInclude(JsonInclude.Include.NON_ABSENT) var customDelimiter: Option[String] = None @JsonProperty(defaultValue = "true") @@ -36,45 +36,32 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { executionId: ExecutionIdentity ): PhysicalOp = { // fill in default values - if (customDelimiter.isEmpty || customDelimiter.get.isEmpty) + if (customDelimiter.isEmpty || customDelimiter.get.isEmpty) { customDelimiter = Option(",") + } PhysicalOp .sourcePhysicalOp( workflowId, executionId, operatorIdentifier, - OpExecInitInfo((_, _) => - new CSVScanSourceOpExec( - fileUri.get, - fileEncoding, - limit, - offset, - customDelimiter, - hasHeader, - schemaFunc = () => sourceSchema() - ) + OpExecWithClassName( + "edu.uci.ics.amber.operator.source.scan.csv.CSVScanSourceOpExec", + 
objectMapper.writeValueAsString(this) ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( - SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> inferSchema())) + SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema())) ) } - /** - * Infer Texera.Schema based on the top few lines of data. - * - * @return Texera.Schema build for this operator - */ - @Override - def inferSchema(): Schema = { - if (customDelimiter.isEmpty || fileUri.isEmpty) { + override def sourceSchema(): Schema = { + if (customDelimiter.isEmpty || !fileResolved()) { return null } - - val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() + val stream = DocumentFactory.newReadonlyDocument(new URI(fileName.get)).asInputStream() val inputReader = new InputStreamReader(stream, fileEncoding.getCharset) @@ -111,6 +98,7 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { .builder() .add(header.indices.map(i => new Attribute(header(i), attributeTypeList(i)))) .build() + } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpExec.scala index b22182f47f9..a111219440c 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpExec.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpExec.scala @@ -3,25 +3,17 @@ package edu.uci.ics.amber.operator.source.scan.csv import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings} import edu.uci.ics.amber.core.executor.SourceOperatorExecutor import edu.uci.ics.amber.core.storage.DocumentFactory -import edu.uci.ics.amber.core.tuple.{AttributeTypeUtils, Schema, TupleLike} -import edu.uci.ics.amber.operator.source.scan.FileDecodingMethod +import edu.uci.ics.amber.core.tuple.{AttributeTypeUtils, TupleLike} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import java.io.InputStreamReader import java.net.URI import scala.collection.immutable.ArraySeq -class CSVScanSourceOpExec private[csv] ( - fileUri: String, - fileEncoding: FileDecodingMethod, - limit: Option[Int], - offset: Option[Int], - customDelimiter: Option[String], - hasHeader: Boolean, - schemaFunc: () => Schema -) extends SourceOperatorExecutor { +class CSVScanSourceOpExec private[csv] (descString: String) extends SourceOperatorExecutor { + val desc: CSVScanSourceOpDesc = objectMapper.readValue(descString, classOf[CSVScanSourceOpDesc]) var inputReader: InputStreamReader = _ var parser: CsvParser = _ - var schema: Schema = _ var nextRow: Array[String] = _ var numRowGenerated = 0 @@ -45,12 +37,12 @@ class CSVScanSourceOpExec private[csv] ( } var tupleIterator = rowIterator - .drop(offset.getOrElse(0)) + .drop(desc.offset.getOrElse(0)) .map(row => { try { TupleLike( ArraySeq.unsafeWrapArray( - AttributeTypeUtils.parseFields(row.asInstanceOf[Array[Any]], schema) + AttributeTypeUtils.parseFields(row.asInstanceOf[Array[Any]], desc.sourceSchema()) ): _* ) } catch { @@ -59,19 +51,19 @@ class CSVScanSourceOpExec private[csv] ( }) .filter(t => t != null) - if (limit.isDefined) tupleIterator = tupleIterator.take(limit.get) + if (desc.limit.isDefined) tupleIterator = tupleIterator.take(desc.limit.get) tupleIterator } override def open(): Unit = { inputReader = new InputStreamReader( - DocumentFactory.newReadonlyDocument(new 
URI(fileUri)).asInputStream(), - fileEncoding.getCharset + DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asInputStream(), + desc.fileEncoding.getCharset ) val csvFormat = new CsvFormat() - csvFormat.setDelimiter(customDelimiter.get.charAt(0)) + csvFormat.setDelimiter(desc.customDelimiter.get.charAt(0)) csvFormat.setLineSeparator("\n") csvFormat.setComment( '\u0000' @@ -79,12 +71,10 @@ class CSVScanSourceOpExec private[csv] ( val csvSetting = new CsvParserSettings() csvSetting.setMaxCharsPerColumn(-1) csvSetting.setFormat(csvFormat) - csvSetting.setHeaderExtractionEnabled(hasHeader) + csvSetting.setHeaderExtractionEnabled(desc.hasHeader) parser = new CsvParser(csvSetting) parser.beginParsing(inputReader) - - schema = schemaFunc() } override def close(): Unit = { diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index eb50cbe0910..4d4202da703 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -4,13 +4,14 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.storage.DocumentFactory import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc +import edu.uci.ics.amber.util.JSONUtils.objectMapper import java.io.IOException import java.net.URI @@ -36,54 +37,33 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { executionId: ExecutionIdentity ): PhysicalOp = { // fill in default values - if (customDelimiter.get.isEmpty) + if (customDelimiter.get.isEmpty) { customDelimiter = Option(",") - - // here, the stream requires to be seekable, so datasetFileDesc creates a temp file here - // TODO: consider a better way - val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile() - val totalBytes: Long = file.length() + } PhysicalOp .sourcePhysicalOp( workflowId, executionId, operatorIdentifier, - OpExecInitInfo((idx, workerCount) => { - // TODO: add support for limit - // TODO: add support for offset - val startOffset: Long = totalBytes / workerCount * idx - val endOffset: Long = - if (idx != workerCount - 1) totalBytes / workerCount * (idx + 1) else totalBytes - new ParallelCSVScanSourceOpExec( - file, - customDelimiter, - hasHeader, - startOffset, - endOffset, - schemaFunc = () => sourceSchema() - ) - }) + OpExecWithClassName( + "edu.uci.ics.amber.operator.source.scan.csv.ParallelCSVScanSourceOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withParallelizable(true) .withPropagateSchema( - SchemaPropagationFunc(_ => 
Map(operatorInfo.outputPorts.head.id -> inferSchema()))
+      SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
     )
   }

-  /**
-    * Infer Texera.Schema based on the top few lines of data.
-    *
-    * @return Texera.Schema build for this operator
-    */
-  @Override
-  def inferSchema(): Schema = {
-    if (customDelimiter.isEmpty || fileUri.isEmpty) {
+  override def sourceSchema(): Schema = {
+    if (customDelimiter.isEmpty || !fileResolved()) {
       return null
     }
-    val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile()
+    val file = DocumentFactory.newReadonlyDocument(new URI(fileName.get)).asFile()

     implicit object CustomFormat extends DefaultCSVFormat {
       override val delimiter: Char = customDelimiter.get.charAt(0)
@@ -118,6 +98,7 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc {
         )
       )
       .build()
+  }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpExec.scala
index 59ba08b0169..9bdde254c79 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpExec.scala
@@ -1,24 +1,24 @@
 package edu.uci.ics.amber.operator.source.scan.csv

 import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike}
+import edu.uci.ics.amber.core.storage.DocumentFactory
+import edu.uci.ics.amber.core.tuple.{Attribute, AttributeTypeUtils, TupleLike}
 import edu.uci.ics.amber.operator.source.BufferedBlockReader
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.tukaani.xz.SeekableFileInputStream

-import java.io.File
+import java.net.URI
 import java.util
 import java.util.stream.{IntStream, Stream}
 import scala.collection.compat.immutable.ArraySeq

 class ParallelCSVScanSourceOpExec private[csv] (
-    file: File,
-    customDelimiter: Option[String],
-    hasHeader: Boolean,
-    startOffset: Long,
-    endOffset: Long,
-    schemaFunc: () => Schema
+    descString: String,
+    idx: Int = 0,
+    workerCount: Int = 1
 ) extends SourceOperatorExecutor {
-  private var schema: Schema = _
+  val desc: ParallelCSVScanSourceOpDesc =
+    objectMapper.readValue(descString, classOf[ParallelCSVScanSourceOpDesc])
+  // compute once at construction: sourceSchema() re-reads the file to infer the schema
+  val schema = desc.sourceSchema()
   private var reader: BufferedBlockReader = _

   override def produceTuple(): Iterator[TupleLike] =
@@ -42,6 +42,6 @@ class ParallelCSVScanSourceOpExec private[csv] (
       return null
     }

     // null values may be omitted at the end of a row; pad to match the schema
     if (fields.length != schema.getAttributes.size)
       fields = Stream
@@ -68,19 +69,29 @@ class ParallelCSVScanSourceOpExec private[csv] (
   }.filter(tuple => tuple != null)

   override def open(): Unit = {
+    // the stream needs to be seekable, so asFile() materializes a temporary file here
+    // TODO: consider a better way
+    val file = DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asFile()
+    val totalBytes: Long = file.length()
+    // TODO: add support for limit
+    // TODO: add support for offset
+    val startOffset: Long = totalBytes / workerCount * idx
+    val endOffset: Long =
+      if (idx != workerCount - 1) totalBytes / workerCount * (idx + 1) else totalBytes
+
     val stream = new SeekableFileInputStream(file)
-    schema = schemaFunc()
+    stream.seek(startOffset)
     reader = new BufferedBlockReader(
       stream,
       endOffset - startOffset,
-      customDelimiter.get.charAt(0),
+      desc.customDelimiter.get.charAt(0),
       null
     )
     // skip line if this worker reads from middle of a file
     if (startOffset > 0) reader.readLine
     // skip line if this worker reads the start of a file, and the file has a header line
-    if (startOffset == 0 && hasHeader) reader.readLine
+    if (startOffset == 0 && desc.hasHeader) reader.readLine
   }

   override def close(): Unit = reader.close()
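The byte-range arithmetic in open() above hands each worker a contiguous slice of the file, with the last worker absorbing the remainder; the readLine calls then realign each slice to line boundaries. A standalone illustration of the resulting ranges, assuming a 100-byte file and 3 workers:

    // Worked example of the open() byte ranges, outside Amber:
    val totalBytes = 100L
    val workerCount = 3
    val ranges = (0 until workerCount).map { idx =>
      val start = totalBytes / workerCount * idx
      val end   = if (idx != workerCount - 1) totalBytes / workerCount * (idx + 1) else totalBytes
      (start, end)
    }
    // Vector((0,33), (33,66), (66,100))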
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala
index f3b2ea1f2e6..9ea25e13147 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala
@@ -1,16 +1,16 @@
 package edu.uci.ics.amber.operator.source.scan.csvOld

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
-import com.fasterxml.jackson.databind.annotation.JsonDeserialize
 import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.storage.DocumentFactory
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.inferSchemaFromRows
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc
+import edu.uci.ics.amber.util.JSONUtils.objectMapper

 import java.io.IOException
 import java.net.URI
@@ -20,8 +20,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc {
   @JsonProperty(defaultValue = ",")
   @JsonSchemaTitle("Delimiter")
   @JsonPropertyDescription("delimiter to separate each line into fields")
-  @JsonDeserialize(contentAs = classOf[java.lang.String])
-  var customDelimiter: Option[String] = None
+  var customDelimiter: Option[String] = Some(",")

   @JsonProperty(defaultValue = "true")
   @JsonSchemaTitle("Header")
@@ -36,43 +35,32 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc {
     executionId: ExecutionIdentity
   ): PhysicalOp = {
     // fill in default values
-    if (customDelimiter.get.isEmpty)
+    if (customDelimiter.get.isEmpty) {
       customDelimiter = Option(",")
+    }

     PhysicalOp
       .sourcePhysicalOp(
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) =>
-          new CSVOldScanSourceOpExec(
-            fileUri.get,
-            fileEncoding,
-            limit,
-            offset,
-            customDelimiter,
-            hasHeader,
-            schemaFunc = () => sourceSchema()
-          )
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.source.scan.csvOld.CSVOldScanSourceOpExec",
+          objectMapper.writeValueAsString(this)
         )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withPropagateSchema(
-        SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> inferSchema()))
+        SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
       )
   }

-  /**
-    * Infer Texera.Schema based on the top few lines of data.
-    *
-    * @return Texera.Schema build for this operator
-    */
-  @Override
-  def inferSchema(): Schema = {
-    if (customDelimiter.isEmpty || fileUri.isEmpty) {
+  override def sourceSchema(): Schema = {
+    if (customDelimiter.isEmpty || !fileResolved()) {
       return null
     }
-    val file = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asFile()
+    // infer schema from the first few lines of the file
+    val file = DocumentFactory.newReadonlyDocument(new URI(fileName.get)).asFile()
     implicit object CustomFormat extends DefaultCSVFormat {
       override val delimiter: Char = customDelimiter.get.charAt(0)
     }
@@ -108,6 +96,7 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc {
       )
     )
     .build()
+  }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpExec.scala
index 7a92698d426..28241ea3cf5 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpExec.scala
@@ -4,27 +4,22 @@ import com.github.tototoshi.csv.{CSVReader, DefaultCSVFormat}
 import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.storage.DocumentFactory
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeTypeUtils, Schema, TupleLike}
-import edu.uci.ics.amber.operator.source.scan.FileDecodingMethod
+import edu.uci.ics.amber.util.JSONUtils.objectMapper

 import java.net.URI
 import scala.collection.compat.immutable.ArraySeq

 class CSVOldScanSourceOpExec private[csvOld] (
-    fileUri: String,
-    fileEncoding: FileDecodingMethod,
-    limit: Option[Int],
-    offset: Option[Int],
-    customDelimiter: Option[String],
-    hasHeader: Boolean,
-    schemaFunc: () => Schema
+    descString: String
 ) extends SourceOperatorExecutor {
-  var schema: Schema = _
+  val desc: CSVOldScanSourceOpDesc =
+    objectMapper.readValue(descString, classOf[CSVOldScanSourceOpDesc])
   var reader: CSVReader = _
   var rows: Iterator[Seq[String]] = _
-
+  val schema: Schema = desc.sourceSchema()
   override def produceTuple(): Iterator[TupleLike] = {
-    var tuples = rows
+    val tuples = rows
       .map(fields =>
         try {
           val parsedFields: Array[Any] = AttributeTypeUtils.parseFields(
@@ -40,24 +35,27 @@ class CSVOldScanSourceOpExec private[csvOld] (
       )
       .filter(tuple => tuple != null)

-    if (limit.isDefined) tuples = tuples.take(limit.get)
-    tuples
+    if (desc.limit.isDefined)
+      tuples.take(desc.limit.get)
+    else {
+      tuples
+    }
   }

   override def open(): Unit = {
-    schema = schemaFunc()
     implicit object CustomFormat extends DefaultCSVFormat {
-      override val delimiter: Char = customDelimiter.get.charAt(0)
+      override val delimiter: Char = desc.customDelimiter.get.charAt(0)
     }
-    val filePath = 
DocumentFactory.newReadonlyDocument(new URI(fileUri)).asFile().toPath - reader = CSVReader.open(filePath.toString, fileEncoding.getCharset.name())(CustomFormat) + val filePath = DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asFile().toPath + reader = CSVReader.open(filePath.toString, desc.fileEncoding.getCharset.name())(CustomFormat) // skip line if this worker reads the start of a file, and the file has a header line - val startOffset = offset.getOrElse(0) + (if (hasHeader) 1 else 0) - + val startOffset = desc.offset.getOrElse(0) + (if (desc.hasHeader) 1 else 0) rows = reader.iterator.drop(startOffset) } override def close(): Unit = { - reader.close() + if (reader != null) { + reader.close() + } } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala index 0be43a62c39..9a9deee9bbc 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala @@ -2,15 +2,15 @@ package edu.uci.ics.amber.operator.source.scan.json import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.fasterxml.jackson.databind.JsonNode -import edu.uci.ics.amber.core.executor.OpExecInitInfo +import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.storage.DocumentFactory import edu.uci.ics.amber.core.storage.model.DatasetFileDocument import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.inferSchemaFromRows import edu.uci.ics.amber.core.tuple.{Attribute, Schema} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc import edu.uci.ics.amber.util.JSONUtils.{JSONToMap, objectMapper} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import java.io._ import java.net.URI @@ -38,56 +38,30 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalOp = { - val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream() - // count lines and partition the task to each worker - val reader = new BufferedReader( - new InputStreamReader(stream, fileEncoding.getCharset) - ) - val offsetValue = offset.getOrElse(0) - var lines = reader.lines().iterator().asScala.drop(offsetValue) - if (limit.isDefined) lines = lines.take(limit.get) - val count: Int = lines.map(_ => 1).sum - reader.close() PhysicalOp .sourcePhysicalOp( workflowId, executionId, operatorIdentifier, - OpExecInitInfo((idx, workerCount) => { - val startOffset: Int = offsetValue + count / workerCount * idx - val endOffset: Int = - offsetValue + (if (idx != workerCount - 1) count / workerCount * (idx + 1) - else count) - new JSONLScanSourceOpExec( - fileUri.get, - fileEncoding, - startOffset, - endOffset, - flatten, - schemaFunc = () => inferSchema() - ) - }) + OpExecWithClassName( + "edu.uci.ics.amber.operator.source.scan.json.JSONLScanSourceOpExec", + objectMapper.writeValueAsString(this) + ) ) .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withParallelizable(true) .withPropagateSchema( - SchemaPropagationFunc(_ => 
Map(operatorInfo.outputPorts.head.id -> inferSchema()))
+      SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
     )
   }

-  /**
-    * Infer Texera.Schema based on the top few lines of data.
-    *
-    * @return Texera.Schema build for this operator
-    */
-  @Override
-  def inferSchema(): Schema = {
-    if (fileUri.isEmpty) {
+  override def sourceSchema(): Schema = {
+    if (!fileResolved()) {
       return null
     }
-    val stream = DocumentFactory.newReadonlyDocument(new URI(fileUri.get)).asInputStream()
+    val stream = DocumentFactory.newReadonlyDocument(new URI(fileName.get)).asInputStream()
     val reader = new BufferedReader(new InputStreamReader(stream, fileEncoding.getCharset))

     var fieldNames = Set[String]()
@@ -132,6 +106,6 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc {
         .map(i => new Attribute(sortedFieldNames(i), attributeTypes(i)))
       )
       .build()
-  }
+  }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala
index 221fdd57050..ec4490d6964 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala
@@ -3,8 +3,7 @@
 import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.storage.DocumentFactory
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.parseField
-import edu.uci.ics.amber.core.tuple.{Schema, TupleLike}
-import edu.uci.ics.amber.operator.source.scan.FileDecodingMethod
+import edu.uci.ics.amber.core.tuple.TupleLike
 import edu.uci.ics.amber.operator.source.scan.json.JSONUtil.JSONToMap
 import edu.uci.ics.amber.util.JSONUtils.objectMapper

@@ -14,21 +13,20 @@ import scala.jdk.CollectionConverters.IteratorHasAsScala
 import scala.util.{Failure, Success, Try}

 class JSONLScanSourceOpExec private[json] (
-    fileUri: String,
-    fileEncoding: FileDecodingMethod,
-    startOffset: Int,
-    endOffset: Int,
-    flatten: Boolean,
-    schemaFunc: () => Schema
+    descString: String,
+    idx: Int = 0,
+    workerCount: Int = 1
 ) extends SourceOperatorExecutor {
-  private var schema: Schema = _
+  private val desc: JSONLScanSourceOpDesc =
+    objectMapper.readValue(descString, classOf[JSONLScanSourceOpDesc])
+  // compute once: sourceSchema() re-reads the whole input file to infer the schema,
+  // so it must not be called per line inside produceTuple()
+  private val schema = desc.sourceSchema()
   private var rows: Iterator[String] = _
   private var reader: BufferedReader = _

   override def produceTuple(): Iterator[TupleLike] = {
     rows.flatMap { line =>
       Try {
-        val data = JSONToMap(objectMapper.readTree(line), flatten).withDefaultValue(null)
+        val data = JSONToMap(objectMapper.readTree(line), desc.flatten).withDefaultValue(null)
         val fields = schema.getAttributeNames.map { fieldName =>
           parseField(data(fieldName), schema.getAttribute(fieldName).getType)
         }
@@ -41,14 +39,23 @@ class JSONLScanSourceOpExec private[json] (
   }

   override def open(): Unit = {
-    schema = schemaFunc()
+    val stream = DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asInputStream()
+    // count lines and partition the task to each worker
     reader = new BufferedReader(
-      new InputStreamReader(
-        DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream(),
-        fileEncoding.getCharset
-      )
+      new InputStreamReader(stream, desc.fileEncoding.getCharset)
     )
-    rows = reader.lines().iterator().asScala.slice(startOffset, endOffset)
+    val offsetValue = 
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala
index 221fdd57050..ec4490d6964 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpExec.scala
@@ -3,8 +3,7 @@ package edu.uci.ics.amber.operator.source.scan.json
 import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.storage.DocumentFactory
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.parseField
-import edu.uci.ics.amber.core.tuple.{Schema, TupleLike}
-import edu.uci.ics.amber.operator.source.scan.FileDecodingMethod
+import edu.uci.ics.amber.core.tuple.TupleLike
 import edu.uci.ics.amber.operator.source.scan.json.JSONUtil.JSONToMap
 import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
@@ -14,21 +13,20 @@ import scala.jdk.CollectionConverters.IteratorHasAsScala
 import scala.util.{Failure, Success, Try}
 
 class JSONLScanSourceOpExec private[json] (
-    fileUri: String,
-    fileEncoding: FileDecodingMethod,
-    startOffset: Int,
-    endOffset: Int,
-    flatten: Boolean,
-    schemaFunc: () => Schema
+    descString: String,
+    idx: Int = 0,
+    workerCount: Int = 1
 ) extends SourceOperatorExecutor {
-  private var schema: Schema = _
+  private val desc: JSONLScanSourceOpDesc =
+    objectMapper.readValue(descString, classOf[JSONLScanSourceOpDesc])
   private var rows: Iterator[String] = _
   private var reader: BufferedReader = _
 
   override def produceTuple(): Iterator[TupleLike] = {
+    // resolve the schema once; desc.sourceSchema() re-reads the file top on every call
+    val schema = desc.sourceSchema()
     rows.flatMap { line =>
       Try {
-        val data = JSONToMap(objectMapper.readTree(line), flatten).withDefaultValue(null)
+        val data = JSONToMap(objectMapper.readTree(line), desc.flatten).withDefaultValue(null)
         val fields = schema.getAttributeNames.map { fieldName =>
           parseField(data(fieldName), schema.getAttribute(fieldName).getType)
         }
@@ -41,14 +39,23 @@ class JSONLScanSourceOpExec private[json] (
   }
 
   override def open(): Unit = {
-    schema = schemaFunc()
+    val stream = DocumentFactory.newReadonlyDocument(new URI(desc.fileName.get)).asInputStream()
+    // count lines and partition the task to each worker
     reader = new BufferedReader(
-      new InputStreamReader(
-        DocumentFactory.newReadonlyDocument(new URI(fileUri)).asInputStream(),
-        fileEncoding.getCharset
-      )
+      new InputStreamReader(stream, desc.fileEncoding.getCharset)
     )
-    rows = reader.lines().iterator().asScala.slice(startOffset, endOffset)
+    val offsetValue = desc.offset.getOrElse(0)
+    var lines = reader.lines().iterator().asScala.drop(offsetValue)
+    if (desc.limit.isDefined) lines = lines.take(desc.limit.get)
+    // fully consuming it1 to count forces `duplicate` to buffer all lines for it2
+    val (it1, it2) = lines.duplicate
+    val count: Int = it1.map(_ => 1).sum
+
+    // `lines` has already been advanced past `offset`, so each worker's slice is
+    // relative to the remaining lines, not to the start of the file
+    val startOffset: Int = count / workerCount * idx
+    val endOffset: Int =
+      if (idx != workerCount - 1) count / workerCount * (idx + 1) else count
+
+    rows = it2.slice(startOffset, endOffset)
   }
 
   override def close(): Unit = reader.close()
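The `open()` above splits `count` lines evenly across `workerCount` workers, with the last worker
absorbing the remainder. A small standalone illustration of the same arithmetic (the values are
assumed for the example, not taken from the PR):

    // count = 10 lines, workerCount = 3:
    //   worker 0 -> [0, 3), worker 1 -> [3, 6), worker 2 -> [6, 10)
    def workerRange(count: Int, workerCount: Int, idx: Int): (Int, Int) = {
      val start = count / workerCount * idx
      val end = if (idx != workerCount - 1) count / workerCount * (idx + 1) else count
      (start, end)
    }
    assert((0 until 3).map(workerRange(10, 3, _)) == Seq((0, 3), (3, 6), (6, 10)))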
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala
index e3aaec7da42..bdb59fff827 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala
@@ -2,12 +2,13 @@ package edu.uci.ics.amber.operator.source.scan.text
 
 import com.fasterxml.jackson.annotation.JsonProperty
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.UIWidget
 import edu.uci.ics.amber.operator.source.SourceOperatorDescriptor
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.OutputPort
 
@@ -26,8 +27,9 @@ class TextInputSourceOpDesc extends SourceOperatorDescriptor with TextSourceOpDe
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) =>
-          new TextInputSourceOpExec(attributeType, textInput, fileScanLimit, fileScanOffset)
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.source.scan.text.TextInputSourceOpExec",
+          objectMapper.writeValueAsString(this)
         )
       )
       .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpExec.scala
index 104c9cae558..76260167adc 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpExec.scala
@@ -4,27 +4,26 @@ import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.parseField
 import edu.uci.ics.amber.core.tuple.TupleLike
 import edu.uci.ics.amber.operator.source.scan.FileAttributeType
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 class TextInputSourceOpExec private[text] (
-    fileAttributeType: FileAttributeType,
-    textInput: String,
-    fileScanLimit: Option[Int] = None,
-    fileScanOffset: Option[Int] = None
+    descString: String
 ) extends SourceOperatorExecutor {
-
+  private val desc: TextInputSourceOpDesc =
+    objectMapper.readValue(descString, classOf[TextInputSourceOpDesc])
   override def produceTuple(): Iterator[TupleLike] = {
-    (if (fileAttributeType.isSingle) {
-       Iterator(textInput)
+    (if (desc.attributeType.isSingle) {
+       Iterator(desc.textInput)
     } else {
-       textInput.linesIterator.slice(
-         fileScanOffset.getOrElse(0),
-         fileScanOffset.getOrElse(0) + fileScanLimit.getOrElse(Int.MaxValue)
+       desc.textInput.linesIterator.slice(
+         desc.fileScanOffset.getOrElse(0),
+         desc.fileScanOffset.getOrElse(0) + desc.fileScanLimit.getOrElse(Int.MaxValue)
       )
     }).map(line =>
-      TupleLike(fileAttributeType match {
+      TupleLike(desc.attributeType match {
         case FileAttributeType.SINGLE_STRING => line
         case FileAttributeType.BINARY        => line.getBytes
-        case _                               => parseField(line, fileAttributeType.getType)
+        case _                               => parseField(line, desc.attributeType.getType)
       })
     )
   }
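The non-single branch above windows the input with `slice(offset, offset + limit)`. A
self-contained example of that windowing, using assumed sample text and settings:

    val text = "a\nb\nc\nd\ne"
    val fileScanOffset = Some(1)
    val fileScanLimit = Some(2)
    val window = text.linesIterator
      .slice(
        fileScanOffset.getOrElse(0),
        fileScanOffset.getOrElse(0) + fileScanLimit.getOrElse(Int.MaxValue)
      )
      .toList
    assert(window == List("b", "c")) // skips 1 line, then keeps the next 2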
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala
index 1eff093a236..77113ff4660 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala
@@ -106,20 +106,7 @@ abstract class SQLSourceOpDesc extends SourceOperatorDescriptor {
   @BatchByColumn
   var interval = 0L
 
-  /**
-    * Make sure all the required parameters are not empty,
-    * then query the remote PostgreSQL server for the table schema
-    *
-    * @return Tuple.Schema
-    */
-  override def sourceSchema(): Schema = {
-    if (
-      this.host == null || this.port == null || this.database == null
-      || this.table == null || this.username == null || this.password == null
-    )
-      return null
-    querySchema
-  }
+  override def sourceSchema(): Schema = querySchema
 
   // needs to define getters for sub classes to override Jackson Annotations
   def getKeywords: Option[String] = keywords
@@ -131,7 +118,14 @@ abstract class SQLSourceOpDesc extends SourceOperatorDescriptor {
     *
     * @return Schema
     */
-  protected def querySchema: Schema = {
+  private def querySchema: Schema = {
+    if (
+      this.host == null || this.port == null || this.database == null
+      || this.table == null || this.username == null || this.password == null
+    ) {
+      return null
+    }
+
     updatePort()
     val schemaBuilder = Schema.builder()
     try {
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpExec.scala
index 77bb30d7731..e232d13d254 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpExec.scala
@@ -3,31 +3,18 @@ package edu.uci.ics.amber.operator.source.sql
 import edu.uci.ics.amber.core.executor.SourceOperatorExecutor
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.{parseField, parseTimestamp}
 import edu.uci.ics.amber.core.tuple._
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 import java.sql._
 import scala.collection.mutable.ArrayBuffer
 import scala.util.control.Breaks.{break, breakable}
 
-abstract class SQLSourceOpExec(
-    // source configs
-    table: String,
-    var curLimit: Option[Long],
-    var curOffset: Option[Long],
-    // progressiveness related
-    progressive: Option[Boolean],
-    batchByColumn: Option[String],
-    min: Option[String],
-    max: Option[String],
-    interval: Long,
-    // filter conditions:
-    keywordSearch: Boolean,
-    keywordSearchByColumn: String,
-    keywords: String,
-    schemaFunc: () => Schema
-) extends SourceOperatorExecutor {
-
-  // connection and query related
+abstract class SQLSourceOpExec(descString: String) extends SourceOperatorExecutor {
+  val desc: SQLSourceOpDesc = objectMapper.readValue(descString, classOf[SQLSourceOpDesc])
   var schema: Schema = _
+  var curLimit: Option[Long] = None
+  var curOffset: Option[Long] = None
+  // connection and query related
   val tableNames: ArrayBuffer[String] = ArrayBuffer()
   var batchByAttribute: Option[Attribute] = None
   var connection: Connection = _
@@ -100,7 +87,7 @@ abstract class SQLSourceOpExec(
         // update the limit in order to adapt to progressive batches
         curLimit.foreach(limit => {
           if (limit > 0) {
-            curLimit = Option(limit - 1)
+            curLimit = Some(limit - 1)
           }
         })
         return tuple
@@ -141,18 +128,18 @@ abstract class SQLSourceOpExec(
     */
   @throws[SQLException]
   override def open(): Unit = {
-    schema = schemaFunc()
     batchByAttribute =
-      if (progressive.getOrElse(false)) Option(schema.getAttribute(batchByColumn.get)) else None
+      if (desc.progressive.getOrElse(false)) Option(schema.getAttribute(desc.batchByColumn.get))
+      else None
     connection = establishConn()
 
     // load user table names from the given database
     loadTableNames()
     // validates the input table name
-    if (!tableNames.contains(table))
-      throw new RuntimeException("Can't find the given table `" + table + "`.")
+    if (!tableNames.contains(desc.table))
+      throw new RuntimeException("Can't find the given table `" + desc.table + "`.")
 
     // load for batch column value boundaries used to split mini queries
-    if (progressive.getOrElse(false)) initBatchColumnBoundaries()
+    if (desc.progressive.getOrElse(false)) initBatchColumnBoundaries()
   }
 
   /**
@@ -244,7 +231,7 @@ abstract class SQLSourceOpExec(
   }
 
   protected def addBaseSelect(queryBuilder: StringBuilder): Unit = {
-    queryBuilder ++= "\n" + "SELECT * FROM " + table + " where 1 = 1"
+    queryBuilder ++= "\n" + "SELECT * FROM " + desc.table + " where 1 = 1"
   }
 
   /**
@@ -272,10 +259,10 @@ abstract class SQLSourceOpExec(
       case Some(attribute) =>
         attribute.getType match {
           case AttributeType.INTEGER | AttributeType.LONG | AttributeType.TIMESTAMP =>
-            nextLowerBound = curLowerBound.longValue + interval
+            nextLowerBound = curLowerBound.longValue + desc.interval
             isLastBatch = nextLowerBound.longValue >= upperBound.longValue
           case AttributeType.DOUBLE =>
-            nextLowerBound = curLowerBound.doubleValue + interval
+            nextLowerBound = curLowerBound.doubleValue + desc.interval
             isLastBatch = nextLowerBound.doubleValue >= upperBound.doubleValue
           case AttributeType.BOOLEAN | AttributeType.STRING | AttributeType.ANY | _ =>
             throw new IllegalArgumentException("Unexpected type: " + attribute.getType)
@@ -289,7 +276,7 @@ abstract class SQLSourceOpExec(
             " < " + batchAttributeToString(nextLowerBound))
       case None =>
         throw new IllegalArgumentException(
-          "no valid batchByColumn to iterate: " + batchByColumn.getOrElse("")
+          "no valid batchByColumn to iterate: " + desc.batchByColumn.getOrElse("")
         )
     }
     curLowerBound = nextLowerBound
@@ -316,7 +303,7 @@ abstract class SQLSourceOpExec(
         }
       case None =>
         throw new IllegalArgumentException(
-          "No valid batchByColumn to iterate: " + batchByColumn.getOrElse("")
+          "No valid batchByColumn to iterate: " + desc.batchByColumn.getOrElse("")
         )
     }
 
@@ -335,7 +322,7 @@ abstract class SQLSourceOpExec(
       case Some(attribute) =>
         var result: Number = null
         val preparedStatement = connection.prepareStatement(
-          "SELECT " + side + "(" + attribute.getName + ") FROM " + table + ";"
+          "SELECT " + side + "(" + attribute.getName + ") FROM " + desc.table + ";"
         )
         val resultSet = preparedStatement.executeQuery
         resultSet.next
@@ -410,7 +397,7 @@ abstract class SQLSourceOpExec(
     addFilterConditions(queryBuilder)
 
     // add sliding window if progressive mode is enabled
-    if (progressive.getOrElse(false) && batchByColumn.isDefined && interval > 0L)
+    if (desc.progressive.getOrElse(false) && desc.batchByColumn.isDefined && desc.interval > 0L)
       addBatchSlidingWindow(queryBuilder)
 
     // add limit if provided
@@ -422,7 +409,7 @@ abstract class SQLSourceOpExec(
     }
 
     // add fixed offset if not progressive
-    if (!progressive.getOrElse(false) && curOffset.isDefined) addOffset(queryBuilder)
+    if (!desc.progressive.getOrElse(false) && curOffset.isDefined) addOffset(queryBuilder)
 
     // end
     terminateSQL(queryBuilder)
@@ -450,7 +437,12 @@ abstract class SQLSourceOpExec(
     var curIndex = 1
 
     // fill up the keywords
-    if (keywordSearch && keywordSearchByColumn != null && keywords != null) {
+    val keywords = desc.keywords.orNull
+    if (
+      desc.keywordSearch.getOrElse(
+        false
+      ) && desc.keywordSearchByColumn.orNull != null && keywords != null
+    ) {
       preparedStatement.setString(curIndex, keywords)
       curIndex += 1
     }
@@ -464,7 +456,7 @@ abstract class SQLSourceOpExec(
     }
 
     // fill up offset if progressive mode is not enabled
-    if (!progressive.getOrElse(false))
+    if (!desc.progressive.getOrElse(false))
       curOffset match {
         case Some(offset) =>
          preparedStatement.setLong(curIndex, offset)
@@ -488,28 +480,28 @@ abstract class SQLSourceOpExec(
   @throws[IllegalArgumentException]
   private def initBatchColumnBoundaries(): Unit = {
     // TODO: add interval
-    if (batchByAttribute.isDefined && min.isDefined && max.isDefined) {
+    if (batchByAttribute.isDefined && desc.min.isDefined && desc.max.isDefined) {
 
-      if (min.get.equalsIgnoreCase("auto")) curLowerBound = fetchBatchByBoundary("MIN")
+      if (desc.min.get.equalsIgnoreCase("auto")) curLowerBound = fetchBatchByBoundary("MIN")
       else
         batchByAttribute.get.getType match {
-          case AttributeType.TIMESTAMP => curLowerBound = parseTimestamp(min.get).getTime
-          case AttributeType.LONG      => curLowerBound = min.get.toLong
+          case AttributeType.TIMESTAMP => curLowerBound = parseTimestamp(desc.min.get).getTime
+          case AttributeType.LONG      => curLowerBound = desc.min.get.toLong
           case _ =>
            throw new IllegalArgumentException(s"Unsupported type ${batchByAttribute.get.getType}")
         }
 
-      if (max.get.equalsIgnoreCase("auto")) upperBound = fetchBatchByBoundary("MAX")
+      if (desc.max.get.equalsIgnoreCase("auto")) upperBound = fetchBatchByBoundary("MAX")
       else
         batchByAttribute.get.getType match {
-          case AttributeType.TIMESTAMP => upperBound = parseTimestamp(max.get).getTime
-          case AttributeType.LONG      => upperBound = max.get.toLong
+          case AttributeType.TIMESTAMP => upperBound = parseTimestamp(desc.max.get).getTime
+          case AttributeType.LONG      => upperBound = desc.max.get.toLong
           case _ =>
            throw new IllegalArgumentException(s"Unsupported type ${batchByAttribute.get.getType}")
         }
     } else {
       throw new IllegalArgumentException(
-        s"Missing required progressive configuration, $batchByAttribute, $min or $max."
+        s"Missing required progressive configuration, $batchByAttribute, ${desc.min} or ${desc.max}."
       )
     }
   }
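The sliding-window logic above advances `curLowerBound` by `desc.interval` until it reaches
`upperBound`, clamping the final window. A worked sketch of that progression with assumed bounds:

    var curLowerBound = 0L
    val upperBound = 10L
    val interval = 4L
    // produces windows [0,4), [4,8), [8,10] -- the last batch is clamped to upperBound
    while (curLowerBound < upperBound) {
      val nextLowerBound = curLowerBound + interval
      val isLastBatch = nextLowerBound >= upperBound
      println(s"[$curLowerBound, ${if (isLastBatch) upperBound else nextLowerBound})")
      curLowerBound = nextLowerBound
    }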
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala
index 8ab3249d909..6f688ae8e68 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala
@@ -7,23 +7,24 @@ import com.fasterxml.jackson.annotation.{
 }
 import com.fasterxml.jackson.databind.annotation.JsonDeserialize
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
+import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
+import edu.uci.ics.amber.core.workflow.OutputPort
 import edu.uci.ics.amber.operator.filter.FilterPredicate
-import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.{
   AutofillAttributeName,
   AutofillAttributeNameList,
   UIWidget
 }
-import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
-import edu.uci.ics.amber.core.workflow.OutputPort
+import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.source.sql.SQLSourceOpDesc
 import edu.uci.ics.amber.operator.source.sql.asterixdb.AsterixDBConnUtil.{
   fetchDataTypeFields,
   queryAsterixDB
 }
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import kong.unirest.json.JSONObject
 
 @JsonIgnoreProperties(value = Array("username", "password"))
@@ -97,32 +98,9 @@ class AsterixDBSourceOpDesc extends SQLSourceOpDesc {
         workflowId,
         executionId,
         this.operatorIdentifier,
-        OpExecInitInfo((_, _) =>
-          new AsterixDBSourceOpExec(
-            host,
-            port,
-            database,
-            table,
-            limit,
-            offset,
-            progressive,
-            batchByColumn,
-            min,
-            max,
-            interval,
-            keywordSearch.getOrElse(false),
-            keywordSearchByColumn.orNull,
-            keywords.orNull,
-            geoSearch.getOrElse(false),
-            geoSearchByColumns,
-            geoSearchBoundingBox,
-            regexSearch.getOrElse(false),
-            regexSearchByColumn.orNull,
-            regex.orNull,
-            filterCondition.getOrElse(false),
-            filterPredicates,
-            () => sourceSchema()
-          )
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.source.sql.asterixdb.AsterixDBSourceOpExec",
+          objectMapper.writeValueAsString(this)
         )
       )
       .withInputPorts(operatorInfo.inputPorts)
@@ -131,13 +109,6 @@ class AsterixDBSourceOpDesc extends SQLSourceOpDesc {
         SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
       )
 
-  override def sourceSchema(): Schema = {
-    if (this.host == null || this.port == null || this.database == null || this.table == null)
-      return null
-
-    querySchema
-  }
-
   override def operatorInfo: OperatorInfo =
     OperatorInfo(
       "AsterixDB Source",
@@ -149,7 +120,11 @@ class AsterixDBSourceOpDesc extends SQLSourceOpDesc {
   override def updatePort(): Unit =
     port = if (port.trim().equals("default")) "19002" else port
 
-  override def querySchema: Schema = {
+  override def sourceSchema(): Schema = {
+    if (this.host == null || this.port == null || this.database == null || this.table == null) {
+      return null
+    }
+
     updatePort()
 
     val sb: Schema.Builder = Schema.builder()
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpExec.scala
index e950c65106d..88d49b052c7 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpExec.scala
@@ -2,13 +2,13 @@ package edu.uci.ics.amber.operator.source.sql.asterixdb
 
 import com.github.tototoshi.csv.CSVParser
 import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.parseField
-import edu.uci.ics.amber.core.tuple.{AttributeType, Schema, Tuple, TupleLike}
-import edu.uci.ics.amber.operator.filter.FilterPredicate
+import edu.uci.ics.amber.core.tuple.{AttributeType, Tuple, TupleLike}
 import edu.uci.ics.amber.operator.source.sql.SQLSourceOpExec
 import edu.uci.ics.amber.operator.source.sql.asterixdb.AsterixDBConnUtil.{
   queryAsterixDB,
   updateAsterixDBVersionMapping
 }
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 import java.sql._
 import java.time.format.DateTimeFormatter
@@ -17,44 +17,12 @@ import scala.util.control.Breaks.{break, breakable}
 import scala.util.{Failure, Success, Try}
 
 class AsterixDBSourceOpExec private[asterixdb] (
-    host: String,
-    port: String,
-    database: String,
-    table: String,
-    limit: Option[Long],
-    offset: Option[Long],
-    progressive: Option[Boolean],
-    batchByColumn: Option[String],
-    min: Option[String],
-    max: Option[String],
-    interval: Long,
-    keywordSearch: Boolean,
-    keywordSearchByColumn: String,
-    keywords: String,
-    geoSearch: Boolean,
-    geoSearchByColumns: List[String],
-    geoSearchBoundingBox: List[String],
-    regexSearch: Boolean,
-    regexSearchByColumn: String,
-    regex: String,
-    filterCondition: Boolean,
-    filterPredicates: List[FilterPredicate],
-    schemaFunc: () => Schema
-) extends SQLSourceOpExec(
-      table,
-      limit,
-      offset,
-      progressive,
-      batchByColumn,
-      min,
-      max,
-      interval,
-      keywordSearch,
-      keywordSearchByColumn,
-      keywords,
-      schemaFunc
-    ) {
+    descString: String
+) extends SQLSourceOpExec(descString) {
+  override val desc: AsterixDBSourceOpDesc =
+    objectMapper.readValue(descString, classOf[AsterixDBSourceOpDesc])
+  schema = desc.sourceSchema()
   // format Timestamp. TODO: move to some util package
   private val formatter: DateTimeFormatter =
     DateTimeFormatter.ISO_LOCAL_DATE_TIME.withZone(ZoneId.from(ZoneOffset.UTC))
@@ -64,7 +32,7 @@ class AsterixDBSourceOpExec private[asterixdb] (
 
   override def open(): Unit = {
     // update AsterixDB API version upon open
-    updateAsterixDBVersionMapping(host, port)
+    updateAsterixDBVersionMapping(desc.host, desc.port)
     super.open()
   }
 
@@ -133,7 +101,7 @@ class AsterixDBSourceOpExec private[asterixdb] (
           curQueryString = if (hasNextQuery) generateSqlQuery else None
           curQueryString match {
             case Some(query) =>
-              curResultIterator = queryAsterixDB(host, port, query)
+              curResultIterator = queryAsterixDB(desc.host, desc.port, query)
               break()
             case None =>
               curResultIterator = None
@@ -215,24 +183,26 @@ class AsterixDBSourceOpExec private[asterixdb] (
     */
   @throws[IllegalArgumentException]
   def addFilterConditions(queryBuilder: StringBuilder): Unit = {
-    if (keywordSearch) {
+    if (desc.keywordSearch.getOrElse(false)) {
       addKeywordSearch(queryBuilder)
     }
 
-    if (regexSearch) {
+    if (desc.regexSearch.getOrElse(false)) {
       addRegexSearch(queryBuilder)
     }
 
-    if (geoSearch) {
+    if (desc.geoSearch.getOrElse(false)) {
       addGeoSearch(queryBuilder)
     }
 
-    if (filterCondition) {
+    if (desc.filterCondition.getOrElse(false)) {
       addGeneralFilterCondition(queryBuilder)
     }
   }
 
   private def addKeywordSearch(queryBuilder: StringBuilder): Unit = {
+    val keywordSearchByColumn = desc.keywordSearchByColumn.orNull
+    val keywords = desc.keywords.orNull
     if (keywordSearchByColumn != null && keywords != null) {
       val columnType = schema.getAttribute(keywordSearchByColumn).getType
       if (columnType == AttributeType.STRING) {
@@ -243,6 +213,8 @@ class AsterixDBSourceOpExec private[asterixdb] (
   }
 
   private def addRegexSearch(queryBuilder: StringBuilder): Unit = {
+    val regexSearchByColumn = desc.regexSearchByColumn.orNull
+    val regex = desc.regex.orNull
     if (regexSearchByColumn != null && regex != null) {
       val regexColumnType = schema.getAttribute(regexSearchByColumn).getType
       if (regexColumnType == AttributeType.STRING) {
@@ -256,17 +228,17 @@ class AsterixDBSourceOpExec private[asterixdb] (
 
   private def addGeoSearch(queryBuilder: StringBuilder): Unit = {
     // geolocation must contain more than one point to form a rectangle or polygon
-    if (geoSearchBoundingBox.size > 1 && geoSearchByColumns.nonEmpty) {
+    if (desc.geoSearchBoundingBox.size > 1 && desc.geoSearchByColumns.nonEmpty) {
       val shape = {
-        val points = geoSearchBoundingBox.flatMap(s => s.split(",").map(sub => sub.toDouble))
-        if (geoSearchBoundingBox.size == 2) {
+        val points = desc.geoSearchBoundingBox.flatMap(s => s.split(",").map(sub => sub.toDouble))
+        if (desc.geoSearchBoundingBox.size == 2) {
           "create_rectangle(create_point(%.6f,%.6f), create_point(%.6f,%.6f))".format(points: _*)
         } else {
           "create_polygon([" + points.map(x => "%.6f".format(x)).mkString(",") + "])"
         }
       }
       queryBuilder ++= " AND ("
-      queryBuilder ++= geoSearchByColumns
+      queryBuilder ++= desc.geoSearchByColumns
         .map { attr => s"spatial_intersect($attr, $shape)" }
         .mkString(" OR ")
       queryBuilder ++= " ) "
     }
   }
 
   private def addGeneralFilterCondition(queryBuilder: StringBuilder): Unit = {
-    if (filterCondition && filterPredicates.nonEmpty) {
-      val filterString = filterPredicates
+    if (desc.filterCondition.getOrElse(false) && desc.filterPredicates.nonEmpty) {
+      val filterString = desc.filterPredicates
        .map(p => s"(${p.attribute} ${p.condition.getName} ${p.value})")
         .mkString(" OR ")
       queryBuilder ++= s" AND ( $filterString ) "
@@ -292,9 +264,9 @@ class AsterixDBSourceOpExec private[asterixdb] (
     batchByAttribute match {
       case Some(attribute) =>
         val resultString = queryAsterixDB(
-          host,
-          port,
-          "SELECT " + side + "(" + attribute.getName + ") FROM " + database + "." + table + ";"
+          desc.host,
+          desc.port,
+          "SELECT " + side + "(" + attribute.getName + ") FROM " + desc.database + "." + desc.table + ";"
         ).get.next().toString.stripLineEnd
         Try(
           parseField(
@@ -317,7 +289,7 @@ class AsterixDBSourceOpExec private[asterixdb] (
         .map((entry: (String, Int)) => {
           s"if_missing(${entry._1},null) field_${entry._2}"
         })
-        .mkString(", ")} FROM $database.$table WHERE 1 = 1 "
+        .mkString(", ")} FROM ${desc.database}.${desc.table} WHERE 1 = 1 "
   }
 
   override def addLimit(queryBuilder: StringBuilder): Unit = {
@@ -342,7 +314,7 @@ class AsterixDBSourceOpExec private[asterixdb] (
         }
       case None =>
         throw new IllegalArgumentException(
-          "No valid batchByColumn to iterate: " + batchByColumn.getOrElse("")
+          "No valid batchByColumn to iterate: " + desc.batchByColumn.getOrElse("")
         )
     }
   }
@@ -353,7 +325,8 @@ class AsterixDBSourceOpExec private[asterixdb] (
     */
   override protected def loadTableNames(): Unit = {
     // fetch for all tables, it is also equivalent to a health check
-    val tables = queryAsterixDB(host, port, "select `DatasetName` from Metadata.`Dataset`;")
+    val tables =
+      queryAsterixDB(desc.host, desc.port, "select `DatasetName` from Metadata.`Dataset`;")
     tables.get.foreach(table => {
       tableNames.append(table.toString.stripPrefix("\"").stripLineEnd.stripSuffix("\""))
     })
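Throughout these executors, settings that used to arrive as plain nullable constructor parameters
are now `Option` fields on the descriptor, bridged back with `orNull` and `getOrElse`. A tiny
sketch of the two bridges, with assumed values:

    val keywordSearch: Option[Boolean] = None
    val keywordSearchByColumn: Option[String] = None
    // Option[Boolean] -> Boolean flag; absent means disabled
    val enabled: Boolean = keywordSearch.getOrElse(false)
    // Option[String] -> nullable String, for the legacy null checks
    val column: String = keywordSearchByColumn.orNull
    assert(!enabled && column == null)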
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpDesc.scala
index 073e900e658..fbc583d8b68 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpDesc.scala
@@ -1,10 +1,11 @@
 package edu.uci.ics.amber.operator.source.sql.mysql
 
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.source.sql.SQLSourceOpDesc
 import edu.uci.ics.amber.operator.source.sql.mysql.MySQLConnUtil.connect
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.OutputPort
 
@@ -21,26 +22,9 @@ class MySQLSourceOpDesc extends SQLSourceOpDesc {
         workflowId,
         executionId,
         this.operatorIdentifier,
-        OpExecInitInfo((_, _) =>
-          new MySQLSourceOpExec(
-            host,
-            port,
-            database,
-            table,
-            username,
-            password,
-            limit,
-            offset,
-            progressive,
-            batchByColumn,
-            min,
-            max,
-            interval,
-            keywordSearch.getOrElse(false),
-            keywordSearchByColumn.orNull,
-            keywords.orNull,
-            () => sourceSchema()
-          )
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.source.sql.mysql.MySQLSourceOpExec",
+          objectMapper.writeValueAsString(this)
         )
       )
       .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpExec.scala
index 56e6e25d008..9b42ad43ac0 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/mysql/MySQLSourceOpExec.scala
@@ -1,53 +1,31 @@
 package edu.uci.ics.amber.operator.source.sql.mysql
 
-import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.AttributeType
 import edu.uci.ics.amber.operator.source.sql.SQLSourceOpExec
 import edu.uci.ics.amber.operator.source.sql.mysql.MySQLConnUtil.connect
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 import java.sql._
 
 class MySQLSourceOpExec private[mysql] (
-    host: String,
-    port: String,
-    database: String,
-    table: String,
-    username: String,
-    password: String,
-    limit: Option[Long],
-    offset: Option[Long],
-    progressive: Option[Boolean],
-    batchByColumn: Option[String],
-    min: Option[String],
-    max: Option[String],
-    interval: Long,
-    keywordSearch: Boolean,
-    keywordSearchByColumn: String,
-    keywords: String,
-    schemaFunc: () => Schema
-) extends SQLSourceOpExec(
-      table,
-      limit,
-      offset,
-      progressive,
-      batchByColumn,
-      min,
-      max,
-      interval,
-      keywordSearch,
-      keywordSearchByColumn,
-      keywords,
-      schemaFunc
-    ) {
-
+    descString: String
+) extends SQLSourceOpExec(descString) {
+  override val desc: MySQLSourceOpDesc =
+    objectMapper.readValue(descString, classOf[MySQLSourceOpDesc])
+  schema = desc.sourceSchema()
   val FETCH_TABLE_NAMES_SQL =
     "SELECT table_name FROM information_schema.tables WHERE table_schema = ?;"
 
   @throws[SQLException]
-  override def establishConn(): Connection = connect(host, port, database, username, password)
+  override def establishConn(): Connection =
+    connect(desc.host, desc.port, desc.database, desc.username, desc.password)
 
   @throws[RuntimeException]
   override def addFilterConditions(queryBuilder: StringBuilder): Unit = {
-    if (keywordSearch && keywordSearchByColumn != null && keywords != null) {
+    val keywordSearchByColumn = desc.keywordSearchByColumn.orNull
+    if (
+      desc.keywordSearch.getOrElse(false) && keywordSearchByColumn != null && desc.keywords.orNull != null
+    ) {
       val columnType = schema.getAttribute(keywordSearchByColumn).getType
 
       if (columnType == AttributeType.STRING)
@@ -61,7 +39,7 @@ class MySQLSourceOpExec private[mysql] (
   @throws[SQLException]
   override protected def loadTableNames(): Unit = {
     val preparedStatement = connection.prepareStatement(FETCH_TABLE_NAMES_SQL)
-    preparedStatement.setString(1, database)
+    preparedStatement.setString(1, desc.database)
     val resultSet = preparedStatement.executeQuery
     while ({
       resultSet.next
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpDesc.scala
index 4abc00c2c6b..529ec85d971 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpDesc.scala
@@ -3,12 +3,13 @@ package edu.uci.ics.amber.operator.source.sql.postgresql
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.fasterxml.jackson.databind.annotation.JsonDeserialize
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.UIWidget
 import edu.uci.ics.amber.operator.source.sql.SQLSourceOpDesc
 import edu.uci.ics.amber.operator.source.sql.postgresql.PostgreSQLConnUtil.connect
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.OutputPort
 
@@ -34,26 +35,9 @@ class PostgreSQLSourceOpDesc extends SQLSourceOpDesc {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) =>
-          new PostgreSQLSourceOpExec(
-            host,
-            port,
-            database,
-            table,
-            username,
-            password,
-            limit,
-            offset,
-            progressive,
-            batchByColumn,
-            min,
-            max,
-            interval,
-            keywordSearch.getOrElse(false),
-            keywordSearchByColumn.orNull,
-            keywords.orNull,
-            () => sourceSchema()
-          )
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.source.sql.postgresql.PostgreSQLSourceOpExec",
+          objectMapper.writeValueAsString(this)
         )
       )
       .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpExec.scala
index 73223d3e5ab..05a2a29067b 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/postgresql/PostgreSQLSourceOpExec.scala
@@ -1,52 +1,30 @@
 package edu.uci.ics.amber.operator.source.sql.postgresql
 
-import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.AttributeType
 import edu.uci.ics.amber.operator.source.sql.SQLSourceOpExec
 import edu.uci.ics.amber.operator.source.sql.postgresql.PostgreSQLConnUtil.connect
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 import java.sql._
 
-class PostgreSQLSourceOpExec private[postgresql] (
-    host: String,
-    port: String,
-    database: String,
-    table: String,
-    username: String,
-    password: String,
-    limit: Option[Long],
-    offset: Option[Long],
-    progressive: Option[Boolean],
-    batchByColumn: Option[String],
-    min: Option[String],
-    max: Option[String],
-    interval: Long,
-    keywordSearch: Boolean,
-    keywordSearchByColumn: String,
-    keywords: String,
-    schemaFunc: () => Schema
-) extends SQLSourceOpExec(
-      table,
-      limit,
-      offset,
-      progressive,
-      batchByColumn,
-      min,
-      max,
-      interval,
-      keywordSearch,
-      keywordSearchByColumn,
-      keywords,
-      schemaFunc
-    ) {
+class PostgreSQLSourceOpExec private[postgresql] (descString: String)
+    extends SQLSourceOpExec(descString) {
+  override val desc: PostgreSQLSourceOpDesc =
+    objectMapper.readValue(descString, classOf[PostgreSQLSourceOpDesc])
+  schema = desc.sourceSchema()
   val FETCH_TABLE_NAMES_SQL =
     "SELECT table_name FROM information_schema.tables WHERE table_type='BASE TABLE';"
 
   @throws[SQLException]
-  override def establishConn(): Connection = connect(host, port, database, username, password)
+  override def establishConn(): Connection =
+    connect(desc.host, desc.port, desc.database, desc.username, desc.password)
 
   @throws[RuntimeException]
   override def addFilterConditions(queryBuilder: StringBuilder): Unit = {
-    if (keywordSearch && keywordSearchByColumn != null && keywords != null) {
+    val keywordSearchByColumn = desc.keywordSearchByColumn.orNull
+    if (
+      desc.keywordSearch.getOrElse(false) && keywordSearchByColumn != null && desc.keywords.orNull != null
+    ) {
       val columnType = schema.getAttribute(keywordSearchByColumn).getType
 
       if (columnType == AttributeType.STRING) {
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala
index 1aab480e7d5..134af1029cd 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala
@@ -2,11 +2,12 @@ package edu.uci.ics.amber.operator.split
 
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.Schema
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.LogicalOp
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 
@@ -30,7 +31,10 @@ class SplitOpDesc extends LogicalOp {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new SplitOpExec(k, seed))
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.split.SplitOpExec",
+          objectMapper.writeValueAsString(this)
+        )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpExec.scala
index 848982a1e20..a0f3544e8de 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpExec.scala
@@ -2,22 +2,22 @@ package edu.uci.ics.amber.operator.split
 
 import edu.uci.ics.amber.core.executor.OperatorExecutor
 import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.workflow.PortIdentity
 
 import scala.util.Random
 
 class SplitOpExec(
-    k: Int,
-    seed: Int
+    descString: String
 ) extends OperatorExecutor {
-
-  lazy val random = new Random(seed)
+  val desc: SplitOpDesc = objectMapper.readValue(descString, classOf[SplitOpDesc])
+  lazy val random = new Random(desc.seed)
 
   override def processTupleMultiPort(
       tuple: Tuple,
       port: Int
   ): Iterator[(TupleLike, Option[PortIdentity])] = {
-    val isTraining = random.nextInt(100) < k
+    val isTraining = random.nextInt(100) < desc.k
     // training output port: 0, testing output port: 1
     val port = if (isTraining) PortIdentity(0) else PortIdentity(1)
     Iterator.single((tuple, Some(port)))
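`SplitOpExec` routes roughly `k` percent of tuples to the training port by comparing a seeded
random draw against `k`; the fixed seed makes the split reproducible across runs. A standalone
sketch with assumed values:

    import scala.util.Random

    val k = 80                  // assumed: 80% training
    val random = new Random(42) // assumed seed
    val draws = Seq.fill(10000)(random.nextInt(100) < k)
    println(draws.count(identity) / 10000.0) // close to 0.8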
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala
index 94a2ac3b852..e77663fdf0b 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala
@@ -1,7 +1,7 @@
 package edu.uci.ics.amber.operator.symmetricDifference
 
 import com.google.common.base.Preconditions
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.Schema
 import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp}
 import edu.uci.ics.amber.operator.LogicalOp
@@ -21,7 +21,9 @@ class SymmetricDifferenceOpDesc extends LogicalOp {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new SymmetricDifferenceOpExec())
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.symmetricDifference.SymmetricDifferenceOpExec"
+        )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala
index eca814f491c..b52f299c0ff 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala
@@ -2,11 +2,12 @@ package edu.uci.ics.amber.operator.typecasting
 
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{AttributeTypeUtils, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.map.MapOpDesc
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 
@@ -27,7 +28,10 @@ class TypeCastingOpDesc extends MapOpDesc {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new TypeCastingOpExec(typeCastingUnits))
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.typecasting.TypeCastingOpExec",
+          objectMapper.writeValueAsString(this)
+        )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExec.scala
index 998d0504583..821c76c02cc 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExec.scala
@@ -2,14 +2,19 @@ package edu.uci.ics.amber.operator.typecasting
 
 import edu.uci.ics.amber.core.tuple.{AttributeTypeUtils, Tuple, TupleLike}
 import edu.uci.ics.amber.operator.map.MapOpExec
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
+
+class TypeCastingOpExec(descString: String) extends MapOpExec {
+
+  private val desc: TypeCastingOpDesc =
+    objectMapper.readValue(descString, classOf[TypeCastingOpDesc])
 
-class TypeCastingOpExec(typeCastingUnits: List[TypeCastingUnit]) extends MapOpExec {
   this.setMapFunc(castTuple)
 
   private def castTuple(tuple: Tuple): TupleLike =
     AttributeTypeUtils.tupleCasting(
       tuple,
-      typeCastingUnits
+      desc.typeCastingUnits
         .map(typeCastingUnit => typeCastingUnit.attribute -> typeCastingUnit.resultType)
         .toMap
     )
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala
index a3fa40a4e01..9fe0089c4ba 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala
@@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.udf.java
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithCode
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{
   PartitionInfo,
@@ -95,7 +95,7 @@ class JavaUDFOpDesc extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, "java")
+          OpExecWithCode(code, "java")
         )
         .withDerivePartition(_ => UnknownPartition())
         .withInputPorts(operatorInfo.inputPorts)
@@ -111,7 +111,7 @@ class JavaUDFOpDesc extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, "java")
+          OpExecWithCode(code, "java")
         )
         .withDerivePartition(_ => UnknownPartition())
         .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala
index 24d6bb62549..985fa54fede 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala
@@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.udf.python
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithCode
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc, UnknownPartition}
 import edu.uci.ics.amber.operator.LogicalOp
@@ -70,7 +70,7 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, "python")
+          OpExecWithCode(code, "python")
         )
         .withDerivePartition(_ => UnknownPartition())
         .withParallelizable(true)
@@ -88,7 +88,7 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, "python")
+          OpExecWithCode(code, "python")
         )
         .withDerivePartition(_ => UnknownPartition())
         .withParallelizable(false)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala
index 3ce08b1510a..1f9b69eb326 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala
@@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.udf.python
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithCode
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{
   PartitionInfo,
@@ -104,7 +104,7 @@ class PythonUDFOpDescV2 extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, "python")
+          OpExecWithCode(code, "python")
         )
         .withDerivePartition(_ => UnknownPartition())
         .withInputPorts(operatorInfo.inputPorts)
@@ -120,7 +120,7 @@ class PythonUDFOpDescV2 extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, "python")
+          OpExecWithCode(code, "python")
         )
         .withDerivePartition(_ => UnknownPartition())
         .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala
index 3086d8e6762..086b014ea68 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.udf.python.source
 
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithCode
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -40,7 +40,6 @@ class PythonUDFSourceOpDescV2 extends SourceOperatorDescriptor {
       workflowId: WorkflowIdentity,
       executionId: ExecutionIdentity
   ): PhysicalOp = {
-    val exec = OpExecInitInfo(code, "python")
     require(workers >= 1, "Need at least 1 worker.")
 
     val func = SchemaPropagationFunc { _: Map[PortIdentity, Schema] =>
@@ -49,7 +48,7 @@ class PythonUDFSourceOpDescV2 extends SourceOperatorDescriptor {
     }
 
     val physicalOp = PhysicalOp
-      .sourcePhysicalOp(workflowId, executionId, operatorIdentifier, exec)
+      .sourcePhysicalOp(workflowId, executionId, operatorIdentifier, OpExecWithCode(code, "python"))
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withIsOneToManyOp(true)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
index 94f31d02f05..42445e21e16 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
@@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.udf.r
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithCode
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{
   PartitionInfo,
@@ -97,7 +97,7 @@ class RUDFOpDesc extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, r_operator_type)
+          OpExecWithCode(code, r_operator_type)
         )
         .withDerivePartition(_ => UnknownPartition())
         .withInputPorts(operatorInfo.inputPorts)
@@ -113,7 +113,7 @@ class RUDFOpDesc extends LogicalOp {
           workflowId,
           executionId,
           operatorIdentifier,
-          OpExecInitInfo(code, r_operator_type)
+          OpExecWithCode(code, r_operator_type)
         )
         .withDerivePartition(_ => UnknownPartition())
         .withInputPorts(operatorInfo.inputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
index afb2e2524e4..0653228a145 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.udf.r
 
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithCode
 import edu.uci.ics.amber.core.tuple.{Attribute, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -49,7 +49,6 @@ class RUDFSourceOpDesc extends SourceOperatorDescriptor {
       executionId: ExecutionIdentity
   ): PhysicalOp = {
     val rOperatorType = if (useTupleAPI) "r-tuple" else "r-table"
-    val exec = OpExecInitInfo(code, rOperatorType)
     require(workers >= 1, "Need at least 1 worker.")
 
     val func = SchemaPropagationFunc { _: Map[PortIdentity, Schema] =>
@@ -58,7 +57,12 @@ class RUDFSourceOpDesc extends SourceOperatorDescriptor {
     }
 
     val physicalOp = PhysicalOp
-      .sourcePhysicalOp(workflowId, executionId, operatorIdentifier, exec)
+      .sourcePhysicalOp(
+        workflowId,
+        executionId,
+        operatorIdentifier,
+        OpExecWithCode(code, rOperatorType)
+      )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withIsOneToManyOp(true)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala
index 6e6efcc1d0c..7e75c24e7f6 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala
@@ -1,7 +1,7 @@
 package edu.uci.ics.amber.operator.union
 
 import com.google.common.base.Preconditions
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.Schema
 import edu.uci.ics.amber.core.workflow.PhysicalOp
 import edu.uci.ics.amber.operator.LogicalOp
@@ -20,7 +20,7 @@ class UnionOpDesc extends LogicalOp {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new UnionOpExec())
+        OpExecWithClassName("edu.uci.ics.amber.operator.union.UnionOpExec")
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala
index 26b36b410dc..5ac736490da 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala
@@ -2,12 +2,13 @@ package edu.uci.ics.amber.operator.unneststring
 
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.google.common.base.Preconditions
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.flatmap.FlatMapOpDesc
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort}
 
@@ -44,7 +45,10 @@ class UnnestStringOpDesc extends FlatMapOpDesc {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new UnnestStringOpExec(attribute, delimiter))
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.unneststring.UnnestStringOpExec",
+          objectMapper.writeValueAsString(this)
+        )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExec.scala
index 09962de8d45..084d1dc0836 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExec.scala
@@ -2,14 +2,16 @@ package edu.uci.ics.amber.operator.unneststring
 
 import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike}
 import edu.uci.ics.amber.operator.flatmap.FlatMapOpExec
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
-class UnnestStringOpExec(attributeName: String, delimiter: String) extends FlatMapOpExec {
-
+class UnnestStringOpExec(descString: String) extends FlatMapOpExec {
+  private val desc: UnnestStringOpDesc =
+    objectMapper.readValue(descString, classOf[UnnestStringOpDesc])
   setFlatMapFunc(splitByDelimiter)
 
   private def splitByDelimiter(tuple: Tuple): Iterator[TupleLike] = {
-    delimiter.r
-      .split(tuple.getField(attributeName).toString)
+    desc.delimiter.r
+      .split(tuple.getField(desc.attribute).toString)
       .filter(_.nonEmpty)
       .iterator
       .map(split => TupleLike(tuple.getFields ++ Seq(split)))
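`UnnestStringOpExec` compiles the configured delimiter as a regular expression (`.r`) and drops
empty segments after splitting. A self-contained example with an assumed delimiter and input:

    val delimiter = ",\\s*" // assumed: comma plus optional whitespace
    val parts = delimiter.r.split("a, b,, c").filter(_.nonEmpty).toList
    assert(parts == List("a", "b", "c"))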
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala
index 5f2696c3cb9..2bf48a41ca8 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala
@@ -2,12 +2,13 @@ package edu.uci.ics.amber.operator.visualization.htmlviz
 
 import com.fasterxml.jackson.annotation.JsonProperty
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.LogicalOp
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort}
@@ -30,7 +31,10 @@ class HtmlVizOpDesc extends LogicalOp {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new HtmlVizOpExec(htmlContentAttrName))
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.visualization.htmlviz.HtmlVizOpExec",
+          objectMapper.writeValueAsString(this)
+        )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExec.scala
index 1177cef30ab..e269803a83d 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExec.scala
@@ -2,11 +2,13 @@ package edu.uci.ics.amber.operator.visualization.htmlviz
 
 import edu.uci.ics.amber.core.executor.OperatorExecutor
 import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 /**
   * HTML Visualization operator to render any given HTML code
   */
-class HtmlVizOpExec(htmlContentAttrName: String) extends OperatorExecutor {
+class HtmlVizOpExec(descString: String) extends OperatorExecutor {
+  private val desc: HtmlVizOpDesc = objectMapper.readValue(descString, classOf[HtmlVizOpDesc])
 
   override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] =
-    Iterator(TupleLike(tuple.getField[Any](htmlContentAttrName)))
+    Iterator(TupleLike(tuple.getField[Any](desc.htmlContentAttrName)))
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala
index 7fe381c2d28..9df368b0ec9 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.urlviz
 
 import com.fasterxml.jackson.annotation.JsonProperty
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.executor.OpExecInitInfo
+import edu.uci.ics.amber.core.executor.OpExecWithClassName
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.LogicalOp
@@ -10,6 +10,7 @@ import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdenti
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 
 /**
@@ -30,7 +31,7 @@ class UrlVizOpDesc extends LogicalOp {
   @JsonProperty(required = true)
   @JsonSchemaTitle("URL content")
   @AutofillAttributeName
-  private val urlContentAttrName: String = ""
+  val urlContentAttrName: String = ""
 
   override def getPhysicalOp(
       workflowId: WorkflowIdentity,
@@ -41,7 +42,10 @@ class UrlVizOpDesc extends LogicalOp {
         workflowId,
         executionId,
         operatorIdentifier,
-        OpExecInitInfo((_, _) => new UrlVizOpExec(urlContentAttrName))
+        OpExecWithClassName(
+          "edu.uci.ics.amber.operator.visualization.urlviz.UrlVizOpExec",
+          objectMapper.writeValueAsString(this)
+        )
       )
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpExec.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpExec.scala
index 32f05d7d6ef..88e09839861 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpExec.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpExec.scala
@@ -2,19 +2,20 @@ package edu.uci.ics.amber.operator.visualization.urlviz
 
 import edu.uci.ics.amber.core.executor.OperatorExecutor
 import edu.uci.ics.amber.core.tuple.{Tuple, TupleLike}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 
 /**
  * URL Visualization operator to render any given URL link
  */
-class UrlVizOpExec(urlContentAttrName: String) extends OperatorExecutor {
-
+class UrlVizOpExec(descString: String) extends OperatorExecutor {
+  private val desc: UrlVizOpDesc = objectMapper.readValue(descString, classOf[UrlVizOpDesc])
  override def processTuple(tuple: Tuple, port: Int): Iterator[TupleLike] = {
    val iframe =
      s"""
         |
         |
         |
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala
index 6ffa101eb87..f952d847e7f 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala
@@ -1,6 +1,7 @@
 package edu.uci.ics.amber.operator.dictionary
 
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema, SchemaEnforceable, Tuple}
+import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 
@@ -23,14 +24,13 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
     .build()
 
   var opExec: DictionaryMatcherOpExec = _
-  var opDesc: DictionaryMatcherOpDesc = _
+  val opDesc: DictionaryMatcherOpDesc = new DictionaryMatcherOpDesc()
   var outputSchema: Schema = _
 
   val dictionaryScan = "nice a a person"
   val dictionarySubstring = "nice a a person and good"
   val dictionaryConjunction = "a person is nice"
 
   before {
-    opDesc = new DictionaryMatcherOpDesc()
     opDesc.attribute = "field1"
     opDesc.dictionary = dictionaryScan
     opDesc.resultAttribute = "matched"
@@ -39,7 +39,7 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   }
 
   it should "open" in {
-    opExec = new DictionaryMatcherOpExec(opDesc.attribute, opDesc.dictionary, opDesc.matchingType)
+    opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc))
     opExec.open()
     assert(opExec.dictionaryEntries != null)
   }
@@ -48,8 +48,8 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
    * Test cases that all Matching Types should match the query
    */
   it should "match a tuple if present in the given dictionary entry when matching type is SCANBASED" in {
-    opExec =
-      new DictionaryMatcherOpExec(opDesc.attribute, opDesc.dictionary, MatchingType.SCANBASED)
+    opDesc.matchingType = MatchingType.SCANBASED
+    opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc))
     opExec.open()
     val processedTuple = opExec.processTuple(tuple, 0).next()
     assert(
@@ -60,6 +60,7 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
 
   it should "match a tuple if present in the given dictionary entry when matching type is SUBSTRING" in {
     opDesc.matchingType = MatchingType.SUBSTRING
+    opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc))
     opExec.open()
     val processedTuple = opExec.processTuple(tuple, 0).next()
     assert(
@@ -70,6 +71,7 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
 
   it should "match a tuple if present in the given dictionary entry when matching type is CONJUNCTION_INDEXBASED" in {
     opDesc.matchingType = MatchingType.CONJUNCTION_INDEXBASED
+    opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc))
     opExec.open()
     val processedTuple = opExec.processTuple(tuple, 0).next()
     assert(
@@ -82,8 +84,9 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
    * Test cases that SCANBASED and SUBSTRING Matching Types should fail to match a query
    */
   it should "not match a tuple if not present in the given dictionary entry when matching type is SCANBASED and not exact match" in {
-    opExec =
-      new DictionaryMatcherOpExec(opDesc.attribute, dictionaryConjunction, MatchingType.SCANBASED)
+    opDesc.dictionary = dictionaryConjunction
+    opDesc.matchingType = MatchingType.SCANBASED
+    opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc))
     opExec.open()
     val processedTuple = opExec.processTuple(tuple, 0).next()
     assert(
@@ -98,6 +101,7 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   it should "not match a tuple if the given dictionary entry doesn't contain all the tuple when the matching type is SUBSTRING" in {
     opDesc.dictionary = dictionaryConjunction
     opDesc.matchingType = MatchingType.SUBSTRING
+    opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc))
     opExec.open()
     val processedTuple = opExec.processTuple(tuple, 0).next()
     assert(
@@ -110,11 +114,9 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   }
 
   it should "match a tuple if present in the given dictionary entry when matching type is CONJUNCTION_INDEXBASED even with different order" in {
-    opExec = new DictionaryMatcherOpExec(
-      opDesc.attribute,
-      dictionaryConjunction,
-
MatchingType.CONJUNCTION_INDEXBASED - ) + opDesc.dictionary = dictionaryConjunction + opDesc.matchingType = MatchingType.CONJUNCTION_INDEXBASED + opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val processedTuple = opExec.processTuple(tuple, 0).next() assert( @@ -130,8 +132,9 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter { * Test cases that only SUBSTRING Matching Type should match the query */ it should "not match a tuple if not present in the given dictionary entry when matching type is SCANBASED when the entry contains more text" in { - opExec = - new DictionaryMatcherOpExec(opDesc.attribute, dictionarySubstring, MatchingType.SCANBASED) + opDesc.dictionary = dictionarySubstring + opDesc.matchingType = MatchingType.SCANBASED + opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val processedTuple = opExec.processTuple(tuple, 0).next() assert( @@ -146,6 +149,7 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter { it should "not match a tuple if not present in the given dictionary entry when matching type is CONJUNCTION_INDEXBASED when the entry contains more text" in { opDesc.dictionary = dictionarySubstring opDesc.matchingType = MatchingType.CONJUNCTION_INDEXBASED + opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val processedTuple = opExec.processTuple(tuple, 0).next() assert( @@ -158,8 +162,9 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "match a tuple if not present in the given dictionary entry when matching type is SUBSTRING when the entry contains more text" in { - opExec = - new DictionaryMatcherOpExec(opDesc.attribute, dictionarySubstring, MatchingType.SUBSTRING) + opDesc.dictionary = dictionarySubstring + opDesc.matchingType = MatchingType.SUBSTRING + opExec = new DictionaryMatcherOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val processedTuple = opExec.processTuple(tuple, 0).next() assert( diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala index 2826630cee3..a17642c8286 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala @@ -1,12 +1,13 @@ package edu.uci.ics.amber.operator.filter import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class SpecializedFilterOpExecSpec extends AnyFlatSpec with BeforeAndAfter { val inputPort: Int = 0 - + val opDesc: SpecializedFilterOpDesc = new SpecializedFilterOpDesc() val tuplesWithOneFieldNull: Iterable[Tuple] = AttributeType .values() @@ -44,40 +45,31 @@ class SpecializedFilterOpExecSpec extends AnyFlatSpec with BeforeAndAfter { .build() it should "open and close" in { - val opExec = new SpecializedFilterOpExec(List()) - opExec.open() - opExec.close() - } - - it should "throw when predicates is null" in { - val opExec = new SpecializedFilterOpExec(null) + opDesc.predicates = List() + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() - 
assertThrows[NullPointerException] { - opExec.processTuple(allNullTuple, inputPort) - } opExec.close() } it should "do nothing when predicates is an empty list" in { - val opExec = new SpecializedFilterOpExec(List()) + opDesc.predicates = List() + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(allNullTuple, inputPort).isEmpty) opExec.close() } it should "not have is_null comparisons be affected by values" in { - val opExec = new SpecializedFilterOpExec( - List(new FilterPredicate("string", ComparisonType.IS_NULL, "value")) - ) + opDesc.predicates = List(new FilterPredicate("string", ComparisonType.IS_NULL, "value")) + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(allNullTuple, inputPort).nonEmpty) opExec.close() } it should "not have is_not_null comparisons be affected by values" in { - val opExec = new SpecializedFilterOpExec( - List(new FilterPredicate("string", ComparisonType.IS_NOT_NULL, "value")) - ) + opDesc.predicates = List(new FilterPredicate("string", ComparisonType.IS_NOT_NULL, "value")) + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(allNullTuple, inputPort).isEmpty) opExec.close() @@ -88,11 +80,9 @@ class SpecializedFilterOpExecSpec extends AnyFlatSpec with BeforeAndAfter { .map(nullTuple => { val attributes = nullTuple.getSchema.getAttributes assert(attributes.length == 1) - - val opExec = new SpecializedFilterOpExec( + opDesc.predicates = List(new FilterPredicate(attributes.head.getName, ComparisonType.IS_NULL, null)) - ) - + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(nullTuple, inputPort).nonEmpty) opExec.close() @@ -100,18 +90,16 @@ class SpecializedFilterOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "filter out non null tuples when filtering is_null" in { - val opExec = new SpecializedFilterOpExec( - List(new FilterPredicate("string", ComparisonType.IS_NULL, "value")) - ) + opDesc.predicates = List(new FilterPredicate("string", ComparisonType.IS_NULL, "value")) + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(nonNullTuple, inputPort).isEmpty) opExec.close() } it should "output non null tuples when filter is_not_null" in { - val opExec = new SpecializedFilterOpExec( - List(new FilterPredicate("string", ComparisonType.IS_NOT_NULL, "value")) - ) + opDesc.predicates = List(new FilterPredicate("string", ComparisonType.IS_NOT_NULL, "value")) + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(nonNullTuple, inputPort).nonEmpty) opExec.close() @@ -122,11 +110,9 @@ class SpecializedFilterOpExecSpec extends AnyFlatSpec with BeforeAndAfter { .map(nullTuple => { val attributes = nullTuple.getSchema.getAttributes assert(attributes.length == 1) - - val opExec = new SpecializedFilterOpExec( + opDesc.predicates = List(new FilterPredicate(attributes.head.getName, ComparisonType.IS_NOT_NULL, null)) - ) - + val opExec = new SpecializedFilterOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() assert(opExec.processTuple(nullTuple, inputPort).isEmpty) opExec.close() diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala 
b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala index b7ecb183fcc..2049d89f7f4 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala @@ -12,6 +12,7 @@ import edu.uci.ics.amber.core.tuple.{ TupleLike } import edu.uci.ics.amber.operator.hashJoin.HashJoinBuildOpExec +import edu.uci.ics.amber.util.JSONUtils.objectMapper class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { val build: Int = 0 val probe: Int = 1 @@ -51,10 +52,11 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { opDesc = new HashJoinOpDesc[String]() opDesc.buildAttributeName = "build_1" opDesc.probeAttributeName = "probe_1" + opDesc.joinType = JoinType.INNER val inputSchemas = Array(schema("build"), schema("probe")) val outputSchema = opDesc.getOutputSchema(inputSchemas) - buildOpExec = new HashJoinBuildOpExec[String]("build_1") + buildOpExec = new HashJoinBuildOpExec[String](objectMapper.writeValueAsString(opDesc)) buildOpExec.open() (0 to 7).map(i => { @@ -67,10 +69,7 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { buildOpExec.onFinish(build) assert(buildOpOutputIterator.hasNext) - probeOpExec = new HashJoinProbeOpExec[String]( - "probe_1", - JoinType.INNER - ) + probeOpExec = new HashJoinProbeOpExec[String](objectMapper.writeValueAsString(opDesc)) probeOpExec.open() while (buildOpOutputIterator.hasNext) { @@ -109,10 +108,11 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { opDesc = new HashJoinOpDesc[String]() opDesc.buildAttributeName = "same" opDesc.probeAttributeName = "same" + opDesc.joinType = JoinType.INNER val inputSchemas = Array(schema("same", 1), schema("same", 2)) val outputSchema = opDesc.getOutputSchema(inputSchemas) - buildOpExec = new HashJoinBuildOpExec[String]("same") + buildOpExec = new HashJoinBuildOpExec[String](objectMapper.writeValueAsString(opDesc)) buildOpExec.open() (0 to 7).map(i => { @@ -124,11 +124,7 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { buildOpExec.onFinish(build) assert(buildOpOutputIterator.hasNext) - probeOpExec = new HashJoinProbeOpExec[String]( - "same", - JoinType.INNER - ) - + probeOpExec = new HashJoinProbeOpExec[String](objectMapper.writeValueAsString(opDesc)) probeOpExec.open() while (buildOpOutputIterator.hasNext) { @@ -166,10 +162,11 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { opDesc = new HashJoinOpDesc[String]() opDesc.buildAttributeName = "same" opDesc.probeAttributeName = "same" + opDesc.joinType = JoinType.FULL_OUTER val inputSchemas = Array(schema("same", 1), schema("same", 2)) val outputSchema = opDesc.getOutputSchema(inputSchemas) - buildOpExec = new HashJoinBuildOpExec[String]("same") + buildOpExec = new HashJoinBuildOpExec[String](objectMapper.writeValueAsString(opDesc)) buildOpExec.open() (0 to 7).map(i => { @@ -181,11 +178,7 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { buildOpExec.onFinish(build) assert(buildOpOutputIterator.hasNext) - probeOpExec = new HashJoinProbeOpExec[String]( - "same", - JoinType.FULL_OUTER - ) - + probeOpExec = new HashJoinProbeOpExec[String](objectMapper.writeValueAsString(opDesc)) probeOpExec.open() while (buildOpOutputIterator.hasNext) { diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala 
b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala index c21d2308791..72c062ed319 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala @@ -16,6 +16,7 @@ import edu.uci.ics.amber.core.tuple.{ Tuple, TupleLike } +import edu.uci.ics.amber.util.JSONUtils.objectMapper class IntervalOpExecSpec extends AnyFlatSpec with BeforeAndAfter { val left: Int = 0 val right: Int = 1 @@ -222,14 +223,7 @@ class IntervalOpExecSpec extends AnyFlatSpec with BeforeAndAfter { timeIntervalType ) val outputSchema = opDesc.getOutputSchema(inputSchemas) - val opExec = new IntervalJoinOpExec( - leftAttributeName = leftKey, - rightAttributeName = rightKey, - includeLeftBound = includeLeftBound, - includeRightBound = includeRightBound, - constant = intervalConstant, - timeIntervalType = Some(timeIntervalType) - ) + val opExec = new IntervalJoinOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() counter = 0 var leftIndex: Int = 0 @@ -400,15 +394,13 @@ class IntervalOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "work with Double value int [] interval" in { - val opExec = new IntervalJoinOpExec( - leftAttributeName = "point_1", - rightAttributeName = "range_1", - includeLeftBound = true, - includeRightBound = true, - constant = 3, - timeIntervalType = Option(TimeIntervalType.DAY) - ) - + opDesc.leftAttributeName = "point_1" + opDesc.rightAttributeName = "range_1" + opDesc.includeLeftBound = true + opDesc.includeRightBound = true + opDesc.constant = 3 + opDesc.timeIntervalType = Option(TimeIntervalType.DAY) + val opExec = new IntervalJoinOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() counter = 0 val pointList: Array[Double] = Array(1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1) diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala index 50dcfc33515..c0e12804e7c 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala @@ -1,11 +1,13 @@ package edu.uci.ics.amber.operator.keywordSearch import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple} +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { val inputPort: Int = 0 + val opDesc: KeywordSearchOpDesc = new KeywordSearchOpDesc() val schema: Schema = Schema .builder() @@ -35,7 +37,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { ) it should "find exact match with single number" in { - val opExec = new KeywordSearchOpExec("text", "3") + opDesc.attribute = "text" + opDesc.keyword = "3" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).nonEmpty) assert(results.length == 1) @@ -44,7 +48,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find exact phrase match" in { - val opExec = new KeywordSearchOpExec("text", "\"3 
stars\"") + opDesc.attribute = "text" + opDesc.keyword = "\"3 stars\"" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).nonEmpty) assert(results.length == 1) @@ -53,7 +59,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find all occurrences of Trump" in { - val opExec = new KeywordSearchOpExec("text", "Trump") + opDesc.attribute = "text" + opDesc.keyword = "Trump" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).nonEmpty) assert(results.length == 2) @@ -62,7 +70,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find all occurrences of Biden" in { - val opExec = new KeywordSearchOpExec("text", "Biden") + opDesc.attribute = "text" + opDesc.keyword = "Biden" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).nonEmpty) assert(results.length == 1) @@ -71,7 +81,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find records containing both Trump AND Biden" in { - val opExec = new KeywordSearchOpExec("text", "Trump AND Biden") + opDesc.attribute = "text" + opDesc.keyword = "Trump AND Biden" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).nonEmpty) assert(results.length == 1) @@ -80,7 +92,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find no matches for exact phrase 'Trump AND Biden'" in { - val opExec = new KeywordSearchOpExec("text", "\"Trump AND Biden\"") + opDesc.attribute = "text" + opDesc.keyword = "\"Trump AND Biden\"" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.isEmpty) @@ -88,7 +102,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find no matches for partial word 'ell'" in { - val opExec = new KeywordSearchOpExec("text", "ell") + opDesc.attribute = "text" + opDesc.keyword = "ell" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.isEmpty) @@ -96,7 +112,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find exact match for word 'the'" in { - val opExec = new KeywordSearchOpExec("text", "the") + opDesc.attribute = "text" + opDesc.keyword = "the" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.length == 1) @@ -105,7 +123,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find exact match for word 'an'" in { - val opExec = new KeywordSearchOpExec("text", "an") + opDesc.attribute = "text" + opDesc.keyword = "an" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.length == 1) @@ -114,7 +134,9 @@ class KeywordSearchOpExecSpec 
extends AnyFlatSpec with BeforeAndAfter { } it should "find exact match for word 'to'" in { - val opExec = new KeywordSearchOpExec("text", "to") + opDesc.attribute = "text" + opDesc.keyword = "to" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.length == 1) @@ -123,7 +145,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find case-insensitive match for 'twitter'" in { - val opExec = new KeywordSearchOpExec("text", "twitter") + opDesc.attribute = "text" + opDesc.keyword = "twitter" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.length == 1) @@ -132,7 +156,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find exact match for Korean text '안녕하세요'" in { - val opExec = new KeywordSearchOpExec("text", "안녕하세요") + opDesc.attribute = "text" + opDesc.keyword = "안녕하세요" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.length == 1) @@ -141,7 +167,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find exact match for Chinese text '你好'" in { - val opExec = new KeywordSearchOpExec("text", "你好") + opDesc.attribute = "text" + opDesc.keyword = "你好" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.length == 1) @@ -150,7 +178,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find no matches for special character '@'" in { - val opExec = new KeywordSearchOpExec("text", "@") + opDesc.attribute = "text" + opDesc.keyword = "@" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.isEmpty) @@ -158,7 +188,9 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "find exact match for special characters '_!@,-'" in { - val opExec = new KeywordSearchOpExec("text", "_!@,-") + opDesc.attribute = "text" + opDesc.keyword = "_!@,-" + val opExec = new KeywordSearchOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() val results = testData.filter(t => opExec.processTuple(t, inputPort).hasNext) assert(results.isEmpty) diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala index 0f0a420cf79..edd889734d7 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.projection import edu.uci.ics.amber.core.tuple._ +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter { @@ -20,31 +21,30 @@ class ProjectionOpExecSpec extends AnyFlatSpec with 
BeforeAndAfter { true ) .build() + val opDesc: ProjectionOpDesc = new ProjectionOpDesc() it should "open" in { - val projectionOpExec = new ProjectionOpExec( - List( - new AttributeUnit("field2", "f2"), - new AttributeUnit("field1", "f1") - ) + opDesc.attributes = List( + new AttributeUnit("field2", "f2"), + new AttributeUnit("field1", "f1") ) + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) projectionOpExec.open() } it should "process Tuple" in { + opDesc.attributes = List( + new AttributeUnit("field2", "f2"), + new AttributeUnit("field1", "f1") + ) val outputSchema = Schema .builder() .add(new Attribute("f1", AttributeType.STRING)) .add(new Attribute("f2", AttributeType.INTEGER)) .build() - val projectionOpExec = new ProjectionOpExec( - List( - new AttributeUnit("field2", "f2"), - new AttributeUnit("field1", "f1") - ) - ) + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) projectionOpExec.open() val outputTuple = @@ -59,20 +59,18 @@ class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter { assert(outputTuple.getField[String](0) == "hello") assert(outputTuple.getField[Int](1) == 1) } - it should "process Tuple with different order" in { + opDesc.attributes = List( + new AttributeUnit("field3", "f3"), + new AttributeUnit("field1", "f1") + ) val outputSchema = Schema .builder() .add(new Attribute("f3", AttributeType.BOOLEAN)) .add(new Attribute("f1", AttributeType.STRING)) .build() - val projectionOpExec = new ProjectionOpExec( - List( - new AttributeUnit("field3", "f3"), - new AttributeUnit("field1", "f1") - ) - ) + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) projectionOpExec.open() val outputTuple = @@ -88,54 +86,48 @@ class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter { assert(outputTuple.getField[String](1) == "hello") } - it should "raise RuntimeException on non-existing fields" in { - val projectionOpExec = new ProjectionOpExec( - List( - new AttributeUnit("field---5", "f5"), - new AttributeUnit("field---6", "f6") - ) + it should "raise RuntimeException on non-existing fields" in { + opDesc.attributes = List( + new AttributeUnit("field---5", "f5"), + new AttributeUnit("field---6", "f6") ) + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) assertThrows[RuntimeException] { projectionOpExec.processTuple(tuple, 0).next() } - } it should "raise IllegalArgumentException on empty attributes" in { - val projectionOpExec = new ProjectionOpExec(List()) + opDesc.attributes = List() + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) assertThrows[IllegalArgumentException] { projectionOpExec.processTuple(tuple, 0).next() } - } it should "raise RuntimeException on duplicate alias" in { - val projectionOpExec = new ProjectionOpExec( - List( - new AttributeUnit("field1", "f"), - new AttributeUnit("field2", "f") - ) + opDesc.attributes = List( + new AttributeUnit("field1", "f"), + new AttributeUnit("field2", "f") ) - + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) assertThrows[RuntimeException] { projectionOpExec.processTuple(tuple, 0).next() } - } it should "allow empty alias" in { + opDesc.attributes = List( + new AttributeUnit("field2", "f2"), + new AttributeUnit("field1", "") + ) val outputSchema = Schema .builder() .add(new Attribute("field1", AttributeType.STRING)) .add(new Attribute("f2", AttributeType.INTEGER)) .build() - val projectionOpExec = new
ProjectionOpExec( - List( - new AttributeUnit("field2", "f2"), - new AttributeUnit("field1", "") - ) - ) + val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc)) projectionOpExec.open() val outputTuple = @@ -150,5 +142,4 @@ class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter { assert(outputTuple.getField[String](0) == "hello") assert(outputTuple.getField[Int](1) == 1) } - } diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala index 896495041d0..aeab7443c1d 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.sortPartitions import edu.uci.ics.amber.core.tuple._ +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class SortPartitionsOpExecSpec extends AnyFlatSpec with BeforeAndAfter { @@ -22,15 +23,11 @@ class SortPartitionsOpExecSpec extends AnyFlatSpec with BeforeAndAfter { ) .build() - var opExec: SortPartitionOpExec = _ + val opDesc: SortPartitionsOpDesc = new SortPartitionsOpDesc() + opDesc.sortAttributeName = "field2" + var opExec: SortPartitionsOpExec = _ before { - opExec = new SortPartitionOpExec( - "field2", - 0, - 0, - 6, - 1 - ) + opExec = new SortPartitionsOpExec(objectMapper.writeValueAsString(opDesc)) } it should "open" in { diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala index f531661df3c..5cef468f7a4 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpExecSpec.scala @@ -1,21 +1,28 @@ package edu.uci.ics.amber.operator.source.fetcher import edu.uci.ics.amber.core.tuple.Schema +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class URLFetcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter { val resultSchema: Schema = new URLFetcherOpDesc().sourceSchema() + val opDesc: URLFetcherOpDesc = new URLFetcherOpDesc() + it should "fetch url and output one tuple with raw bytes" in { - val fetcherOpExec = new URLFetcherOpExec("https://www.google.com", DecodingMethod.RAW_BYTES) + opDesc.url = "https://www.google.com" + opDesc.decodingMethod = DecodingMethod.RAW_BYTES + val fetcherOpExec = new URLFetcherOpExec(objectMapper.writeValueAsString(opDesc)) val iterator = fetcherOpExec.produceTuple() assert(iterator.next().getFields.toList.head.isInstanceOf[Array[Byte]]) assert(!iterator.hasNext) } it should "fetch url and output one tuple with UTF-8 string" in { - val fetcherOpExec = new URLFetcherOpExec("https://www.google.com", DecodingMethod.UTF_8) + opDesc.url = "https://www.google.com" + opDesc.decodingMethod = DecodingMethod.UTF_8 + val fetcherOpExec = new URLFetcherOpExec(objectMapper.writeValueAsString(opDesc)) val iterator = fetcherOpExec.produceTuple() assert(iterator.next().getFields.toList.head.isInstanceOf[String]) 
assert(!iterator.hasNext) diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala index cb9d031952b..bf1a6122e1c 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala @@ -26,10 +26,10 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { parallelCsvScanSourceOpDesc.fileName = Some(TestOperators.CountrySalesSmallCsvPath) parallelCsvScanSourceOpDesc.customDelimiter = Some(",") parallelCsvScanSourceOpDesc.hasHeader = true - parallelCsvScanSourceOpDesc.setFileUri( + parallelCsvScanSourceOpDesc.setResolvedFileName( FileResolver.resolve(parallelCsvScanSourceOpDesc.fileName.get) ) - val inferredSchema: Schema = parallelCsvScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = parallelCsvScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 14) assert(inferredSchema.getAttribute("Order ID").getType == AttributeType.INTEGER) @@ -42,11 +42,11 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { parallelCsvScanSourceOpDesc.fileName = Some(TestOperators.CountrySalesHeaderlessSmallCsvPath) parallelCsvScanSourceOpDesc.customDelimiter = Some(",") parallelCsvScanSourceOpDesc.hasHeader = false - parallelCsvScanSourceOpDesc.setFileUri( + parallelCsvScanSourceOpDesc.setResolvedFileName( FileResolver.resolve(parallelCsvScanSourceOpDesc.fileName.get) ) - val inferredSchema: Schema = parallelCsvScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = parallelCsvScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 14) assert(inferredSchema.getAttribute("column-10").getType == AttributeType.DOUBLE) @@ -58,9 +58,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.fileName = Some(TestOperators.CountrySalesSmallMultiLineCsvPath) csvScanSourceOpDesc.customDelimiter = Some(",") csvScanSourceOpDesc.hasHeader = true - csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) + csvScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) - val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = csvScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 14) assert(inferredSchema.getAttribute("Order ID").getType == AttributeType.INTEGER) @@ -72,9 +72,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { csvScanSourceOpDesc.fileName = Some(TestOperators.CountrySalesHeaderlessSmallCsvPath) csvScanSourceOpDesc.customDelimiter = Some(",") csvScanSourceOpDesc.hasHeader = false - csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) + csvScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) - val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = csvScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 14) assert(inferredSchema.getAttribute("column-10").getType == AttributeType.DOUBLE) @@ -87,9 +87,9 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { Some(TestOperators.CountrySalesSmallMultiLineCustomDelimiterCsvPath) 
csvScanSourceOpDesc.customDelimiter = Some(";") csvScanSourceOpDesc.hasHeader = false - csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) + csvScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) - val inferredSchema: Schema = csvScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = csvScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 14) assert(inferredSchema.getAttribute("column-10").getType == AttributeType.DOUBLE) @@ -102,7 +102,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { Some(TestOperators.CountrySalesSmallMultiLineCustomDelimiterCsvPath) csvScanSourceOpDesc.customDelimiter = Some(";") csvScanSourceOpDesc.hasHeader = false - csvScanSourceOpDesc.setFileUri(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) + csvScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(csvScanSourceOpDesc.fileName.get)) assert( !csvScanSourceOpDesc diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/FileScanSourceOpDescSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/FileScanSourceOpDescSpec.scala index 15ba5adc096..e8b07062cb7 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/FileScanSourceOpDescSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/FileScanSourceOpDescSpec.scala @@ -9,6 +9,7 @@ import edu.uci.ics.amber.operator.source.scan.{ FileScanSourceOpDesc, FileScanSourceOpExec } +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -18,12 +19,12 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { before { fileScanSourceOpDesc = new FileScanSourceOpDesc() - fileScanSourceOpDesc.setFileUri(FileResolver.resolve(TestOperators.TestTextFilePath)) + fileScanSourceOpDesc.setResolvedFileName(FileResolver.resolve(TestOperators.TestTextFilePath)) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.UTF_8 } it should "infer schema with single column representing each line of text in normal text scan mode" in { - val inferredSchema: Schema = fileScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = fileScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 1) assert(inferredSchema.getAttribute("line").getType == AttributeType.STRING) @@ -31,7 +32,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "infer schema with single column representing entire file in outputAsSingleTuple mode" in { fileScanSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING - val inferredSchema: Schema = fileScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = fileScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 1) assert(inferredSchema.getAttribute("line").getType == AttributeType.STRING) @@ -41,7 +42,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.attributeType = FileAttributeType.STRING val customOutputAttributeName: String = "testing" fileScanSourceOpDesc.attributeName = customOutputAttributeName - val inferredSchema: Schema = fileScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = fileScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 1) assert(inferredSchema.getAttribute("testing").getType == 
AttributeType.STRING) @@ -49,7 +50,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "infer schema with integer attribute type" in { fileScanSourceOpDesc.attributeType = FileAttributeType.INTEGER - val inferredSchema: Schema = fileScanSourceOpDesc.inferSchema() + val inferredSchema: Schema = fileScanSourceOpDesc.sourceSchema() assert(inferredSchema.getAttributes.length == 1) assert(inferredSchema.getAttribute("line").getType == AttributeType.INTEGER) @@ -59,15 +60,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = - new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.get, - fileScanSourceOpDesc.attributeType, - fileScanSourceOpDesc.fileEncoding, - fileScanSourceOpDesc.extract, - fileScanSourceOpDesc.outputFileName, - fileScanSourceOpDesc.fileScanLimit, - fileScanSourceOpDesc.fileScanOffset - ) + new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) FileScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = FileScanSourceOpExec .produceTuple() @@ -85,19 +78,13 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with CRLF separators into corresponding output tuples" in { - fileScanSourceOpDesc.setFileUri(FileResolver.resolve(TestOperators.TestCRLFTextFilePath)) + fileScanSourceOpDesc.setResolvedFileName( + FileResolver.resolve(TestOperators.TestCRLFTextFilePath) + ) fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = - new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.get, - fileScanSourceOpDesc.attributeType, - fileScanSourceOpDesc.fileEncoding, - fileScanSourceOpDesc.extract, - fileScanSourceOpDesc.outputFileName, - fileScanSourceOpDesc.fileScanLimit, - fileScanSourceOpDesc.fileScanOffset - ) + new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) FileScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = FileScanSourceOpExec .produceTuple() @@ -117,15 +104,7 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "read first 5 lines of the input text file into a single output tuple" in { fileScanSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING val FileScanSourceOpExec = - new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.get, - fileScanSourceOpDesc.attributeType, - fileScanSourceOpDesc.fileEncoding, - fileScanSourceOpDesc.extract, - fileScanSourceOpDesc.outputFileName, - fileScanSourceOpDesc.fileScanLimit, - fileScanSourceOpDesc.fileScanOffset - ) + new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) FileScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = FileScanSourceOpExec .produceTuple() @@ -144,18 +123,13 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text into corresponding output INTEGER tuples" in { - fileScanSourceOpDesc.setFileUri(FileResolver.resolve(TestOperators.TestNumbersFilePath)) + fileScanSourceOpDesc.setResolvedFileName( + FileResolver.resolve(TestOperators.TestNumbersFilePath) + ) fileScanSourceOpDesc.attributeType = FileAttributeType.INTEGER fileScanSourceOpDesc.fileScanLimit = Option(5) - val FileScanSourceOpExec = new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.get, 
- fileScanSourceOpDesc.attributeType, - fileScanSourceOpDesc.fileEncoding, - fileScanSourceOpDesc.extract, - fileScanSourceOpDesc.outputFileName, - fileScanSourceOpDesc.fileScanLimit, - fileScanSourceOpDesc.fileScanOffset - ) + val FileScanSourceOpExec = + new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) FileScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = FileScanSourceOpExec .produceTuple() @@ -173,20 +147,14 @@ class FileScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { } it should "read first 5 lines of the input text file with US_ASCII encoding" in { - fileScanSourceOpDesc.setFileUri(FileResolver.resolve(TestOperators.TestCRLFTextFilePath)) + fileScanSourceOpDesc.setResolvedFileName( + FileResolver.resolve(TestOperators.TestCRLFTextFilePath) + ) fileScanSourceOpDesc.fileEncoding = FileDecodingMethod.ASCII fileScanSourceOpDesc.attributeType = FileAttributeType.STRING fileScanSourceOpDesc.fileScanLimit = Option(5) val FileScanSourceOpExec = - new FileScanSourceOpExec( - fileScanSourceOpDesc.fileUri.get, - fileScanSourceOpDesc.attributeType, - fileScanSourceOpDesc.fileEncoding, - fileScanSourceOpDesc.extract, - fileScanSourceOpDesc.outputFileName, - fileScanSourceOpDesc.fileScanLimit, - fileScanSourceOpDesc.fileScanOffset - ) + new FileScanSourceOpExec(objectMapper.writeValueAsString(fileScanSourceOpDesc)) FileScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = FileScanSourceOpExec .produceTuple() diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDescSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDescSpec.scala index 8f600070fc1..e94e3d9570e 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDescSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDescSpec.scala @@ -3,6 +3,7 @@ package edu.uci.ics.amber.operator.source.scan.text import edu.uci.ics.amber.core.tuple.{AttributeType, Schema, SchemaEnforceable, Tuple} import edu.uci.ics.amber.operator.TestOperators import edu.uci.ics.amber.operator.source.scan.FileAttributeType +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -51,8 +52,11 @@ class TextInputSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "read first 5 lines of the input text into corresponding output tuples" in { val inputString: String = readFileIntoString(TestOperators.TestTextFilePath) + textInputSourceOpDesc.attributeType = FileAttributeType.STRING + textInputSourceOpDesc.textInput = inputString + textInputSourceOpDesc.fileScanLimit = Option(5) val textScanSourceOpExec = - new TextInputSourceOpExec(FileAttributeType.STRING, inputString, fileScanLimit = Option(5)) + new TextInputSourceOpExec(objectMapper.writeValueAsString(textInputSourceOpDesc)) textScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = textScanSourceOpExec .produceTuple() @@ -73,8 +77,11 @@ class TextInputSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "read first 5 lines of the input text with CRLF separators into corresponding output tuples" in { val inputString: String = readFileIntoString(TestOperators.TestCRLFTextFilePath) + textInputSourceOpDesc.attributeType = FileAttributeType.STRING + textInputSourceOpDesc.textInput = inputString + 
textInputSourceOpDesc.fileScanLimit = Option(5) val textScanSourceOpExec = - new TextInputSourceOpExec(FileAttributeType.STRING, inputString, fileScanLimit = Option(5)) + new TextInputSourceOpExec(objectMapper.writeValueAsString(textInputSourceOpDesc)) textScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = textScanSourceOpExec .produceTuple() @@ -95,8 +102,10 @@ class TextInputSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "read first 5 lines of the input text into a single output tuple" in { val inputString: String = readFileIntoString(TestOperators.TestTextFilePath) + textInputSourceOpDesc.attributeType = FileAttributeType.SINGLE_STRING + textInputSourceOpDesc.textInput = inputString val textScanSourceOpExec = - new TextInputSourceOpExec(FileAttributeType.SINGLE_STRING, inputString) + new TextInputSourceOpExec(objectMapper.writeValueAsString(textInputSourceOpDesc)) textScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = textScanSourceOpExec .produceTuple() @@ -119,8 +128,10 @@ class TextInputSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "read first 5 lines of the input text into corresponding output INTEGER tuples" in { val inputString: String = readFileIntoString(TestOperators.TestNumbersFilePath) textInputSourceOpDesc.attributeType = FileAttributeType.INTEGER + textInputSourceOpDesc.textInput = inputString + textInputSourceOpDesc.fileScanLimit = Option(5) val textScanSourceOpExec = - new TextInputSourceOpExec(FileAttributeType.INTEGER, inputString, fileScanLimit = Option(5)) + new TextInputSourceOpExec(objectMapper.writeValueAsString(textInputSourceOpDesc)) textScanSourceOpExec.open() val processedTuple: Iterator[Tuple] = textScanSourceOpExec .produceTuple() diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala index ae1070f62ba..39b3b5e8d60 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.typecasting import edu.uci.ics.amber.core.tuple._ +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class TypeCastingOpExecSpec extends AnyFlatSpec with BeforeAndAfter { @@ -27,6 +28,8 @@ class TypeCastingOpExecSpec extends AnyFlatSpec with BeforeAndAfter { castingUnit2.resultType = AttributeType.STRING val castingUnits: List[TypeCastingUnit] = List(castingUnit1, castingUnit2) + val opDesc: TypeCastingOpDesc = new TypeCastingOpDesc() + opDesc.typeCastingUnits = castingUnits val tuple: Tuple = Tuple .builder(tupleSchema) .add(new Attribute("field1", AttributeType.STRING), "hello") @@ -42,14 +45,15 @@ class TypeCastingOpExecSpec extends AnyFlatSpec with BeforeAndAfter { .build() it should "open" in { - val typeCastingOpExec = new TypeCastingOpExec(castingUnits) + + val typeCastingOpExec = new TypeCastingOpExec(objectMapper.writeValueAsString(opDesc)) typeCastingOpExec.open() } it should "process Tuple" in { - val typeCastingOpExec = new TypeCastingOpExec(castingUnits) + val typeCastingOpExec = new TypeCastingOpExec(objectMapper.writeValueAsString(opDesc)) typeCastingOpExec.open() diff --git 
a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala index 96cfb08acde..8ce75b6fd5f 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala @@ -2,6 +2,7 @@ package edu.uci.ics.amber.operator.unneststring import edu.uci.ics.amber.core.tuple._ import edu.uci.ics.amber.core.workflow.PortIdentity +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { @@ -32,14 +33,18 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "open" in { - opExec = new UnnestStringOpExec(attributeName = "field1", delimiter = "-") + opDesc.attribute = "field1" + opDesc.delimiter = "-" + opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) opExec.open() assert(opExec.flatMapFunc != null) } it should "split value in the given attribute and output the split result in the result attribute, one for each tuple" in { - opExec = new UnnestStringOpExec(attributeName = "field1", delimiter = "-") + opDesc.attribute = "field1" + opDesc.delimiter = "-" + opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) opExec.open() val processedTuple = opExec @@ -54,7 +59,8 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { it should "generate the correct tuple when there is no delimiter in the value" in { opDesc.attribute = "field3" - opExec = new UnnestStringOpExec(attributeName = "field3", delimiter = "-") + opDesc.delimiter = "-" + opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) opExec.open() val processedTuple = opExec @@ -66,8 +72,9 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "only contain split results that are not null" in { + opDesc.attribute = "field1" opDesc.delimiter = "/" - opExec = new UnnestStringOpExec(attributeName = "field1", delimiter = "/") + opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) val tuple: Tuple = Tuple .builder(tupleSchema) @@ -87,8 +94,9 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "split by regex delimiter" in { + opDesc.attribute = "field1" opDesc.delimiter = "<\\d*>" - opExec = new UnnestStringOpExec(attributeName = "field1", delimiter = "<\\d*>") + opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) val tuple: Tuple = Tuple .builder(tupleSchema) diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala index 820534496bd..f8aa526a0c9 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala +++ 
b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.visualization.htmlviz import edu.uci.ics.amber.core.tuple._ +import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class HtmlVizOpExecSpec extends AnyFlatSpec with BeforeAndAfter { @@ -8,9 +9,9 @@ class HtmlVizOpExecSpec extends AnyFlatSpec with BeforeAndAfter { new Attribute("field1", AttributeType.STRING), new Attribute("field2", AttributeType.STRING) ) - val desc: HtmlVizOpDesc = new HtmlVizOpDesc() + val opDesc: HtmlVizOpDesc = new HtmlVizOpDesc() - val outputSchema: Schema = desc.getOutputSchema(Array(schema)) + val outputSchema: Schema = opDesc.getOutputSchema(Array(schema)) def tuple(): Tuple = Tuple @@ -19,7 +20,8 @@ class HtmlVizOpExecSpec extends AnyFlatSpec with BeforeAndAfter { .build() it should "process a target field" in { - val htmlVizOpExec = new HtmlVizOpExec("field1") + opDesc.htmlContentAttrName = "field1" + val htmlVizOpExec = new HtmlVizOpExec(objectMapper.writeValueAsString(opDesc)) htmlVizOpExec.open() val processedTuple: Tuple = htmlVizOpExec @@ -33,8 +35,8 @@ class HtmlVizOpExecSpec extends AnyFlatSpec with BeforeAndAfter { } it should "process another target field" in { - - val htmlVizOpExec = new HtmlVizOpExec("field2") + opDesc.htmlContentAttrName = "field2" + val htmlVizOpExec = new HtmlVizOpExec(objectMapper.writeValueAsString(opDesc)) htmlVizOpExec.open() val processedTuple: Tuple = htmlVizOpExec From 90f99e52710dc7daf307fbf708a6efde96c7ea55 Mon Sep 17 00:00:00 2001 From: Shengquan Ni <13672781+shengquan-ni@users.noreply.github.com> Date: Tue, 31 Dec 2024 13:41:15 -0800 Subject: [PATCH 04/10] Enhance error handling and stack trace formatting (#3185) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR improves exception handling in AsyncRPCServer to unwrap the actual exception from InvocationTargetException. 
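For quick reference, the unwrapping pattern applied below is roughly the following (a minimal sketch, not the patched method itself; `invokeUnwrapped` and its parameters are hypothetical stand-ins):

    import java.lang.reflect.{InvocationTargetException, Method}

    // Reflection wraps any exception thrown inside the invoked method in an
    // InvocationTargetException; rethrowing its cause surfaces the real error.
    def invokeUnwrapped(method: Method, handler: AnyRef, args: AnyRef*): AnyRef =
      try method.invoke(handler, args: _*)
      catch {
        case e: InvocationTargetException => throw Option(e.getCause).getOrElse(e)
      }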
Old: (screenshot, 2024-12-31 2:38:34 AM) New: (screenshot, 2024-12-31 2:33:18 AM) --- .../engine/common/rpc/AsyncRPCServer.scala | 8 ++++++- .../edu/uci/ics/amber/error/ErrorUtils.scala | 22 +++++++++++-------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/rpc/AsyncRPCServer.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/rpc/AsyncRPCServer.scala index eae8f455049..1977bc9764d 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/rpc/AsyncRPCServer.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/common/rpc/AsyncRPCServer.scala @@ -59,7 +59,13 @@ class AsyncRPCServer( ): Unit = { try { val result = - method.invoke(handler, requestArg, contextArg) + try { + method.invoke(handler, requestArg, contextArg) + } catch { + case e: java.lang.reflect.InvocationTargetException => + throw Option(e.getCause).getOrElse(e) + case e: Throwable => throw e + } result .asInstanceOf[Future[ControlReturn]] .onSuccess { ret => diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/error/ErrorUtils.scala b/core/amber/src/main/scala/edu/uci/ics/amber/error/ErrorUtils.scala index c1569587fbb..f2f97d56192 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/error/ErrorUtils.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/error/ErrorUtils.scala @@ -39,8 +39,13 @@ object ErrorUtils { } def mkControlError(err: Throwable): ControlError = { - val stacktrace = err.getStackTrace.mkString("\n") - ControlError(err.toString, err.getCause.toString, stacktrace, ErrorLanguage.SCALA) + // Format each stack trace element with "at " prefix + val stacktrace = err.getStackTrace.map(element => s"at ${element}").mkString("\n") + if (err.getCause != null) { + ControlError(err.toString, err.getCause.toString, stacktrace, ErrorLanguage.SCALA) + } else { + ControlError(err.toString, "", stacktrace, ErrorLanguage.SCALA) + } } def reconstructThrowable(controlError: ControlError): Throwable = { @@ -52,14 +57,13 @@ object ErrorUtils { val causeThrowable = new Throwable(controlError.errorDetails) reconstructedThrowable.initCause(causeThrowable) } - val stackTraceElements = controlError.stackTrace.split("\n").map { line => - // You need to split each line appropriately to extract the class, method, file, and line number - val stackTracePattern = """\s*at\s+(.+)\((.+):(\d+)\)""".r + + val stackTracePattern = """\s*at\s+(.+)\((.*)\)""".r + val stackTraceElements = controlError.stackTrace.split("\n").flatMap { line => line match { - case stackTracePattern(className, fileName, lineNumber) => - new StackTraceElement(className, "", fileName, lineNumber.toInt) - case _ => - new StackTraceElement("", "", null, -1) // Handle if stack trace format is invalid + case stackTracePattern(className, location) => + Some(new StackTraceElement(className, "", location, -1)) + case _ => None } } reconstructedThrowable.setStackTrace(stackTraceElements) From 0eb36d095734f1c510e35999be72ebf25433dd5f Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Tue, 31 Dec 2024 15:14:24 -0800 Subject: [PATCH 05/10] Remove logical schema propagation (#3186) This PR removes all schema propagation functions from the logical plan. Developers are now required to implement `SchemaPropagationFunc` directly on each PhysicalOp in the physical plan. This ensures that each PhysicalOp has its own distinct schema propagation logic, aligning schema handling more closely with the execution layer.
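To make the new contract concrete, here is a schematic sketch (not code from the patch itself) of what a simple one-input, one-output operator's getPhysicalOp now declares. It uses only constructs visible in this diff (withPropagateSchema, SchemaPropagationFunc, Schema.builder, AttributeType); the appended resultAttr column is a made-up example:

    // Inside a LogicalOp's getPhysicalOp(...): the PhysicalOp itself carries the
    // schema propagation logic; the logical plan no longer computes schemas.
    physicalOp
      .withInputPorts(operatorInfo.inputPorts)
      .withOutputPorts(operatorInfo.outputPorts)
      .withPropagateSchema(
        SchemaPropagationFunc(inputSchemas =>
          Map(
            operatorInfo.outputPorts.head.id -> Schema
              .builder()
              .add(inputSchemas(operatorInfo.inputPorts.head.id)) // pass input through
              .add("resultAttr", AttributeType.STRING) // append one example column
              .build()
          )
        )
      )

    // Tests can still derive schemas at the logical level via the helper described
    // below: opDesc.getExternalOutputSchemas(Map(PortIdentity() -> inputSchema))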
To accommodate the need for schema propagation in the logical plan (primarily for testing purposes), a new method, `getExternalOutputSchemas`, has been introduced. This method facilitates the propagation of schemas across all PhysicalOps within a logical operator, ensuring compatibility with existing testing workflows. --- .../ics/amber/compiler/WorkflowCompiler.scala | 5 +- .../amber/core/workflow/PhysicalPlan.scala | 28 +++++- .../uci/ics/amber/operator/LogicalOp.scala | 59 ++++++----- .../operator/PythonOperatorDescriptor.scala | 15 +-- .../operator/aggregate/AggregateOpDesc.scala | 52 +++------- .../CartesianProductOpDesc.scala | 98 ++++++++----------- .../dictionary/DictionaryMatcherOpDesc.scala | 25 +++-- .../difference/DifferenceOpDesc.scala | 16 ++- .../operator/distinct/DistinctOpDesc.scala | 13 +-- .../amber/operator/dummy/DummyOpDesc.scala | 3 - .../amber/operator/filter/FilterOpDesc.scala | 9 +- .../operator/hashJoin/HashJoinOpDesc.scala | 60 +++++------- ...gingFaceIrisLogisticRegressionOpDesc.scala | 20 ++-- .../HuggingFaceSentimentAnalysisOpDesc.scala | 22 +++-- .../HuggingFaceSpamSMSDetectionOpDesc.scala | 20 ++-- .../HuggingFaceTextSummarizationOpDesc.scala | 18 ++-- .../operator/intersect/IntersectOpDesc.scala | 13 +-- .../intervalJoin/IntervalJoinOpDesc.scala | 42 +++----- .../amber/operator/limit/LimitOpDesc.scala | 3 - .../Scorer/MachineLearningScorerOpDesc.scala | 8 +- .../base/SklearnAdvancedBaseDesc.scala | 8 +- .../ics/amber/operator/map/MapOpDesc.scala | 20 +--- .../projection/ProjectionOpDesc.scala | 46 ++++----- .../ReservoirSamplingOpDesc.scala | 7 -- .../sentiment/SentimentAnalysisOpDesc.scala | 19 ++-- .../sklearn/SklearnClassifierOpDesc.scala | 16 +-- .../SklearnLinearRegressionOpDesc.scala | 16 +-- .../sklearn/SklearnPredictionOpDesc.scala | 19 ++-- .../ics/amber/operator/sort/SortOpDesc.scala | 7 +- .../sortPartitions/SortPartitionsOpDesc.scala | 7 -- .../source/SourceOperatorDescriptor.scala | 6 -- .../reddit/RedditSearchSourceOpDesc.scala | 6 +- .../amber/operator/split/SplitOpDesc.scala | 23 ++--- .../SymmetricDifferenceOpDesc.scala | 21 ++-- .../typecasting/TypeCastingOpDesc.scala | 6 -- .../operator/udf/java/JavaUDFOpDesc.scala | 20 ---- .../DualInputPortsPythonUDFOpDescV2.scala | 66 +++++-------- .../python/PythonLambdaFunctionOpDesc.scala | 13 ++- .../udf/python/PythonTableReducerOpDesc.scala | 30 +++--- .../udf/python/PythonUDFOpDescV2.scala | 55 +++-------- .../source/PythonUDFSourceOpDescV2.scala | 12 +-- .../ics/amber/operator/udf/r/RUDFOpDesc.scala | 47 ++------- .../operator/udf/r/RUDFSourceOpDesc.scala | 11 +-- .../amber/operator/union/UnionOpDesc.scala | 13 +-- .../unneststring/UnnestStringOpDesc.scala | 25 +++-- .../visualization/DotPlot/DotPlotOpDesc.scala | 12 ++- .../IcicleChart/IcicleChartOpDesc.scala | 12 ++- .../ImageViz/ImageVisualizerOpDesc.scala | 12 ++- .../ScatterMatrixChartOpDesc.scala | 12 ++- .../barChart/BarChartOpDesc.scala | 16 ++- .../visualization/boxPlot/BoxPlotOpDesc.scala | 12 ++- .../bubbleChart/BubbleChartOpDesc.scala | 12 ++- .../CandlestickChartOpDesc.scala | 12 ++- .../ContinuousErrorBandsOpDesc.scala | 12 ++- .../contourPlot/ContourPlotOpDesc.scala | 12 ++- .../dumbbellPlot/DumbbellPlotOpDesc.scala | 12 ++- .../FigureFactoryTableOpDesc.scala | 12 ++- .../filledAreaPlot/FilledAreaPlotOpDesc.scala | 12 ++- .../funnelPlot/FunnelPlotOpDesc.scala | 12 ++- .../ganttChart/GanttChartOpDesc.scala | 12 ++- .../visualization/heatMap/HeatMapOpDesc.scala | 12 ++- .../hierarchychart/HierarchyChartOpDesc.scala | 13 ++- 
.../histogram/HistogramChartOpDesc.scala | 12 ++- .../visualization/htmlviz/HtmlVizOpDesc.scala | 19 ++-- .../lineChart/LineChartOpDesc.scala | 12 ++- .../pieChart/PieChartOpDesc.scala | 12 ++- .../quiverPlot/QuiverPlotOpDesc.scala | 12 ++- .../sankeyDiagram/SankeyDiagramOpDesc.scala | 12 ++- .../scatter3DChart/Scatter3dChartOpDesc.scala | 12 ++- .../scatterplot/ScatterplotOpDesc.scala | 12 ++- .../tablesChart/TablesPlotOpDesc.scala | 12 ++- .../ternaryPlot/TernaryPlotOpDesc.scala | 13 ++- .../visualization/urlviz/UrlVizOpDesc.scala | 20 ++-- .../waterfallChart/WaterfallChartOpDesc.scala | 12 ++- .../wordCloud/WordCloudOpDesc.scala | 12 ++- .../CartesianProductOpExecSpec.scala | 5 +- .../DictionaryMatcherOpExecSpec.scala | 3 +- .../operator/hashJoin/HashJoinOpSpec.scala | 21 ++-- .../intervalJoin/IntervalOpExecSpec.scala | 7 +- .../projection/ProjectionOpDescSpec.scala | 29 ++---- .../scan/csv/CSVScanSourceOpDescSpec.scala | 5 - .../PythonLambdaFunctionOpDescSpec.scala | 9 +- .../unneststring/UnnestStringOpExecSpec.scala | 12 +-- .../htmlviz/HtmlVizOpExecSpec.scala | 4 +- 84 files changed, 754 insertions(+), 770 deletions(-) diff --git a/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/WorkflowCompiler.scala b/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/WorkflowCompiler.scala index 6dd8ebbce50..c199dfaf863 100644 --- a/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/WorkflowCompiler.scala +++ b/core/workflow-compiling-service/src/main/scala/edu/uci/ics/amber/compiler/WorkflowCompiler.scala @@ -110,6 +110,8 @@ class WorkflowCompiler( logicalPlan.getTopologicalOpIds.asScala.foreach(logicalOpId => Try { val logicalOp = logicalPlan.getOperator(logicalOpId) + val allUpstreamLinks = logicalPlan + .getUpstreamLinks(logicalOp.operatorIdentifier) val subPlan = logicalOp.getPhysicalPlan(context.workflowId, context.executionId) subPlan @@ -117,8 +119,7 @@ class WorkflowCompiler( .map(subPlan.getOperator) .foreach({ physicalOp => { - val externalLinks = logicalPlan - .getUpstreamLinks(logicalOp.operatorIdentifier) + val externalLinks = allUpstreamLinks .filter(link => physicalOp.inputPorts.contains(link.toPortId)) .flatMap { link => physicalPlan diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalPlan.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalPlan.scala index a405ea646da..1c3d06519c0 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalPlan.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalPlan.scala @@ -2,13 +2,13 @@ package edu.uci.ics.amber.core.workflow import com.fasterxml.jackson.annotation.JsonIgnore import com.typesafe.scalalogging.LazyLogging -import edu.uci.ics.amber.util.VirtualIdentityUtils +import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.virtualidentity.{ ActorVirtualIdentity, OperatorIdentity, PhysicalOpIdentity } -import edu.uci.ics.amber.core.workflow.PhysicalLink +import edu.uci.ics.amber.util.VirtualIdentityUtils import org.jgrapht.alg.connectivity.BiconnectivityInspector import org.jgrapht.alg.shortestpath.AllDirectedPaths import org.jgrapht.graph.DirectedAcyclicGraph @@ -285,4 +285,28 @@ case class PhysicalPlan( chains.filter(s1 => chains.forall(s2 => s1 == s2 || !s1.subsetOf(s2))).toSet } + def propagateSchema(inputSchemas: Map[PortIdentity, Schema]): PhysicalPlan = { + var physicalPlan = PhysicalPlan(operators = Set.empty, links = 
Set.empty) + this + .topologicalIterator() + .map(this.getOperator) + .foreach({ physicalOp => + { + val propagatedPhysicalOp = physicalOp.inputPorts.keys.foldLeft(physicalOp) { + (op, inputPortId) => + op.propagateSchema(inputSchemas.get(inputPortId).map(schema => (inputPortId, schema))) + } + + // Add the operator to the physical plan + physicalPlan = physicalPlan.addOperator(propagatedPhysicalOp.propagateSchema()) + + // Add internal links to the physical plan + physicalPlan = getUpstreamPhysicalLinks(physicalOp.id).foldLeft(physicalPlan) { + (plan, link) => + plan.addLink(link) + } + } + }) + physicalPlan + } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/LogicalOp.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/LogicalOp.scala index 08e750ed8c3..56316f4e23c 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/LogicalOp.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/LogicalOp.scala @@ -5,7 +5,13 @@ import com.fasterxml.jackson.annotation._ import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.executor.OperatorExecutor import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{PhysicalOp, PhysicalPlan} +import edu.uci.ics.amber.core.virtualidentity.{ + ExecutionIdentity, + OperatorIdentity, + WorkflowIdentity +} +import edu.uci.ics.amber.core.workflow.WorkflowContext.{DEFAULT_EXECUTION_ID, DEFAULT_WORKFLOW_ID} +import edu.uci.ics.amber.core.workflow.{PhysicalOp, PhysicalPlan, PortIdentity} import edu.uci.ics.amber.operator.aggregate.AggregateOpDesc import edu.uci.ics.amber.operator.cartesianProduct.CartesianProductOpDesc import edu.uci.ics.amber.operator.dictionary.DictionaryMatcherOpDesc @@ -94,16 +100,9 @@ import edu.uci.ics.amber.operator.visualization.ternaryPlot.TernaryPlotOpDesc import edu.uci.ics.amber.operator.visualization.urlviz.UrlVizOpDesc import edu.uci.ics.amber.operator.visualization.waterfallChart.WaterfallChartOpDesc import edu.uci.ics.amber.operator.visualization.wordCloud.WordCloudOpDesc -import edu.uci.ics.amber.core.virtualidentity.{ - ExecutionIdentity, - OperatorIdentity, - WorkflowIdentity -} -import edu.uci.ics.amber.core.workflow.PortIdentity import org.apache.commons.lang3.builder.{EqualsBuilder, HashCodeBuilder, ToStringBuilder} import java.util.UUID -import scala.collection.mutable import scala.util.Try trait StateTransferFunc @@ -288,19 +287,12 @@ abstract class LogicalOp extends PortDescriptor with Serializable { @JsonProperty(PropertyNameConstants.OPERATOR_VERSION) var operatorVersion: String = getOperatorVersion - @JsonIgnore - val inputPortToSchemaMapping: mutable.Map[PortIdentity, Schema] = mutable.HashMap() - @JsonIgnore - val outputPortToSchemaMapping: mutable.Map[PortIdentity, Schema] = mutable.HashMap() - def operatorIdentifier: OperatorIdentity = OperatorIdentity(operatorId) def getPhysicalOp( workflowId: WorkflowIdentity, executionId: ExecutionIdentity - ): PhysicalOp = { - ??? - } + ): PhysicalOp = ??? 
// a logical operator corresponds multiple physical operators (a small DAG) def getPhysicalPlan( @@ -315,19 +307,12 @@ abstract class LogicalOp extends PortDescriptor with Serializable { def operatorInfo: OperatorInfo - def getOutputSchema(schemas: Array[Schema]): Schema - private def getOperatorVersion: String = { val path = "core/amber/src/main/scala/" val operatorPath = path + this.getClass.getPackage.getName.replace(".", "/") OPVersion.getVersion(this.getClass.getSimpleName, operatorPath) } - // override if the operator has multiple output ports, schema must be specified for each port - def getOutputSchemas(schemas: Array[Schema]): Array[Schema] = { - Array.fill(1)(getOutputSchema(schemas)) - } - override def hashCode: Int = HashCodeBuilder.reflectionHashCode(this) override def equals(that: Any): Boolean = EqualsBuilder.reflectionEquals(this, that, "context") @@ -354,4 +339,32 @@ abstract class LogicalOp extends PortDescriptor with Serializable { @JsonPropertyDescription("Add dummy property if needed") var dummyPropertyList: List[DummyProperties] = List() + /** + * Propagates the schema from external input ports to external output ports. + * This method is primarily used to derive the output schemas for logical operators. + * + * @param inputSchemas A map containing the schemas of the external input ports. + * @return A map of external output port identities to their corresponding schemas. + */ + def getExternalOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + this + .getPhysicalPlan(DEFAULT_WORKFLOW_ID, DEFAULT_EXECUTION_ID) + .propagateSchema(inputSchemas) + .operators + .flatMap { operator => + operator.outputPorts.values + .filterNot { case (port, _, _) => port.id.internal } // Exclude internal ports + .map { + case (port, _, schemaEither) => + schemaEither match { + case Left(error) => throw error + case Right(schema) => + port.id -> schema // Map external port ID to its schema + } + } + } + .toMap + } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala index 941db76f9d5..479352daa0c 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/PythonOperatorDescriptor.scala @@ -1,8 +1,9 @@ package edu.uci.ics.amber.operator import edu.uci.ics.amber.core.executor.OpExecWithCode +import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.workflow.{PhysicalOp, PortIdentity, SchemaPropagationFunc} trait PythonOperatorDescriptor extends LogicalOp { override def getPhysicalOp( @@ -29,15 +30,7 @@ trait PythonOperatorDescriptor extends LogicalOp { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withParallelizable(parallelizable()) - .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - operatorInfo.inputPorts.map(_.id).map(inputSchemas(_)).toArray - ) - ) - ) - ) + .withPropagateSchema(SchemaPropagationFunc(inputSchemas => getOutputSchemas(inputSchemas))) } def parallelizable(): Boolean = false @@ -52,4 +45,6 @@ trait PythonOperatorDescriptor extends LogicalOp { */ def generatePythonCode(): 
String + def getOutputSchemas(inputSchemas: Map[PortIdentity, Schema]): Map[PortIdentity, Schema] + } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala index 14c138562f4..0ea2557f4ef 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala @@ -34,7 +34,7 @@ class AggregateOpDesc extends LogicalOp { workflowId: WorkflowIdentity, executionId: ExecutionIdentity ): PhysicalPlan = { - + if (groupByKeys == null) groupByKeys = List() // TODO: this is supposed to be blocking but due to limitations of materialization naming on the logical operator // we are keeping it not annotated as blocking. val inputPort = InputPort(PortIdentity()) @@ -53,12 +53,17 @@ class AggregateOpDesc extends LogicalOp { .withOutputPorts(List(outputPort)) .withPropagateSchema( SchemaPropagationFunc(inputSchemas => { - aggregations = localAggregations - Map( - PortIdentity(internal = true) -> getOutputSchema( - operatorInfo.inputPorts.map(port => inputSchemas(port.id)).toArray + val inputSchema = inputSchemas(operatorInfo.inputPorts.head.id) + val outputSchema = Schema + .builder() + .add(groupByKeys.map(key => inputSchema.getAttribute(key)): _*) + .add( + localAggregations.map(agg => + agg.getAggregationAttribute(inputSchema.getAttribute(agg.attribute).getType) + ) ) - ) + .build() + Map(PortIdentity(internal = true) -> outputSchema) }) ) @@ -81,9 +86,7 @@ class AggregateOpDesc extends LogicalOp { .withOutputPorts(List(finalOutputPort)) .withPropagateSchema( SchemaPropagationFunc(inputSchemas => - Map(operatorInfo.outputPorts.head.id -> { - inputSchemas(finalInputPort.id) - }) + Map(operatorInfo.outputPorts.head.id -> inputSchemas(finalInputPort.id)) ) ) .withPartitionRequirement(List(Option(HashPartition(groupByKeys)))) @@ -104,34 +107,7 @@ class AggregateOpDesc extends LogicalOp { "Aggregate", "Calculate different types of aggregation values", OperatorGroupConstants.AGGREGATE_GROUP, - inputPorts = List( - InputPort(PortIdentity()) - ), - outputPorts = List( - OutputPort(PortIdentity()) - ) + inputPorts = List(InputPort()), + outputPorts = List(OutputPort()) ) - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - if ( - aggregations.exists(agg => agg.resultAttribute == null || agg.resultAttribute.trim.isEmpty) - ) { - return null - } - if (groupByKeys == null) groupByKeys = List() - Schema - .builder() - .add( - Schema - .builder() - .add(groupByKeys.map(key => schemas(0).getAttribute(key)): _*) - .build() - ) - .add( - aggregations.map(agg => - agg.getAggregationAttribute(schemas(0).getAttribute(agg.attribute).getType) - ) - ) - .build() - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala index 7e71d29b42b..c17a94e3a40 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala @@ -22,16 +22,49 @@ class CartesianProductOpDesc extends LogicalOp { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) 
.withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - Array( - inputSchemas(operatorInfo.inputPorts.head.id), - inputSchemas(operatorInfo.inputPorts.last.id) - ) - ) - ) - ) + SchemaPropagationFunc(inputSchemas => { + + // Combines the left and right input schemas into a single output schema. + // + // - The output schema includes all attributes from the left schema first, followed by + // attributes from the right schema. + // - Duplicate attribute names are resolved by appending an increasing suffix (e.g., `#@1`, `#@2`). + // - Attributes from the left schema retain their original names in the output schema. + // + // Example: + // Left schema: (dup, dup#@1, dup#@2) + // Right schema: (r1, r2, dup) + // Output schema: (dup, dup#@1, dup#@2, r1, r2, dup#@3) + // + // In this example, the last attribute from the right schema (`dup`) is renamed to `dup#@3` + // to avoid conflicts. + + val builder = Schema.builder() + val leftSchema = inputSchemas(operatorInfo.inputPorts.head.id) + val rightSchema = inputSchemas(operatorInfo.inputPorts.last.id) + val leftAttributeNames = leftSchema.getAttributeNames + val rightAttributeNames = rightSchema.getAttributeNames + builder.add(leftSchema) + rightSchema.getAttributes.foreach(attr => { + var newName = attr.getName + while ( + leftAttributeNames.contains(newName) || rightAttributeNames + .filterNot(attrName => attrName == attr.getName) + .contains(newName) + ) { + newName = s"$newName#@1" + } + if (newName == attr.getName) { + // non-duplicate attribute, add to builder as is + builder.add(attr) + } else { + // renamed the duplicate attribute, construct new Attribute + builder.add(new Attribute(newName, attr.getType)) + } + }) + val outputSchema = builder.build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) + }) ) // TODO : refactor to parallelize this operator for better performance and scalability: // can consider hash partition on larger input, broadcast smaller table to each partition @@ -39,46 +72,6 @@ class CartesianProductOpDesc extends LogicalOp { } - /** - * returns a Schema in order of the left input attributes followed by the right attributes - * duplicate attribute names are handled with an increasing suffix count - * - * Left schema attributes should always retain the same name in output schema - * - * For example, Left(dup, dup#@1, dup#@2) cartesian product with Right(r1, r2, dup) - * has output schema: (dup, dup#@1, dup#@2, r1, r2, dup#@3) - * - * Since the last attribute of Right is a duplicate, it increases suffix until it is - * no longer a duplicate, resulting in dup#@3 - */ - def getOutputSchemaInternal(schemas: Array[Schema]): Schema = { - // merge left / right schemas together, sequentially with left schema first - val builder = Schema.builder() - val leftSchema = schemas(0) - val leftAttributeNames = leftSchema.getAttributeNames - val rightSchema = schemas(1) - val rightAttributeNames = rightSchema.getAttributeNames - builder.add(leftSchema) - rightSchema.getAttributes.foreach(attr => { - var newName = attr.getName - while ( - leftAttributeNames.contains(newName) || rightAttributeNames - .filterNot(attrName => attrName == attr.getName) - .contains(newName) - ) { - newName = s"$newName#@1" - } - if (newName == attr.getName) { - // non-duplicate attribute, add to builder as is - builder.add(attr) - } else { - // renamed the duplicate attribute, construct new Attribute - builder.add(new Attribute(newName, attr.getType)) - } - }) - builder.build() - } - 
override def operatorInfo: OperatorInfo = OperatorInfo( "Cartesian Product", @@ -90,9 +83,4 @@ class CartesianProductOpDesc extends LogicalOp { ), outputPorts = List(OutputPort()) ) - - // remove duplicates in attribute names - override def getOutputSchema(schemas: Array[Schema]): Schema = { - getOutputSchemaInternal(schemas) - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala index 4a2cb463355..2a82b03d10b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala @@ -1,16 +1,14 @@ package edu.uci.ics.amber.operator.dictionary import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.map.MapOpDesc import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.util.JSONUtils.objectMapper -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} /** * Dictionary matcher operator matches a tuple if the specified column is in the given dictionary. 
@@ -48,9 +46,16 @@ class DictionaryMatcherOpDesc extends MapOpDesc { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map(operatorInfo.outputPorts.head.id -> getOutputSchema(inputSchemas.values.toArray)) - ) + SchemaPropagationFunc(inputSchemas => { + if (resultAttribute == null || resultAttribute.trim.isEmpty) return null + Map( + operatorInfo.outputPorts.head.id -> Schema + .builder() + .add(inputSchemas.values.head) + .add(resultAttribute, AttributeType.BOOLEAN) + .build() + ) + }) ) } @@ -63,10 +68,4 @@ class DictionaryMatcherOpDesc extends MapOpDesc { outputPorts = List(OutputPort()), supportReconfiguration = true ) - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.length == 1) - if (resultAttribute == null || resultAttribute.trim.isEmpty) return null - Schema.builder().add(schemas(0)).add(resultAttribute, AttributeType.BOOLEAN).build() - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala index 8c144b3756a..8cb81f186c3 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/difference/DifferenceOpDesc.scala @@ -2,12 +2,10 @@ package edu.uci.ics.amber.operator.difference import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class DifferenceOpDesc extends LogicalOp { @@ -26,6 +24,11 @@ class DifferenceOpDesc extends LogicalOp { .withOutputPorts(operatorInfo.outputPorts) .withPartitionRequirement(List(Option(HashPartition()), Option(HashPartition()))) .withDerivePartition(_ => HashPartition()) + .withPropagateSchema(SchemaPropagationFunc(inputSchemas => { + Preconditions.checkArgument(inputSchemas.values.toSet.size == 1) + val outputSchema = inputSchemas.values.head + operatorInfo.outputPorts.map(port => port.id -> outputSchema).toMap + })) } override def operatorInfo: OperatorInfo = @@ -39,9 +42,4 @@ class DifferenceOpDesc extends LogicalOp { ), outputPorts = List(OutputPort(blocking = true)) ) - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.forall(_ == schemas(0))) - schemas(0) - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala index 30c2f9f4b27..7f851743b18 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/distinct/DistinctOpDesc.scala @@ -1,13 +1,10 @@ package edu.uci.ics.amber.operator.distinct -import 
com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow.{HashPartition, InputPort, OutputPort, PhysicalOp} import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} class DistinctOpDesc extends LogicalOp { @@ -26,6 +23,7 @@ class DistinctOpDesc extends LogicalOp { .withOutputPorts(operatorInfo.outputPorts) .withPartitionRequirement(List(Option(HashPartition()))) .withDerivePartition(_ => HashPartition()) + } override def operatorInfo: OperatorInfo = @@ -37,9 +35,4 @@ class DistinctOpDesc extends LogicalOp { outputPorts = List(OutputPort(blocking = true)) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.forall(_ == schemas(0))) - schemas(0) - } - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dummy/DummyOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dummy/DummyOpDesc.scala index 75ce5a933cd..8cdb0d5a5b5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dummy/DummyOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dummy/DummyOpDesc.scala @@ -2,7 +2,6 @@ package edu.uci.ics.amber.operator.dummy import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.{LogicalOp, PortDescription, PortDescriptor} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} @@ -48,6 +47,4 @@ class DummyOpDesc extends LogicalOp with PortDescriptor { allowPortCustomization = true ) } - - override def getOutputSchema(schemas: Array[Schema]): Schema = schemas(0) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/FilterOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/FilterOpDesc.scala index 28c5e44a981..52f66143137 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/FilterOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/filter/FilterOpDesc.scala @@ -1,20 +1,13 @@ package edu.uci.ics.amber.operator.filter -import com.google.common.base.Preconditions -import edu.uci.ics.amber.core.tuple.Schema +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.{LogicalOp, StateTransferFunc} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import scala.util.{Success, Try} abstract class FilterOpDesc extends LogicalOp { - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.length == 1) - schemas(0) - } - override def runtimeReconfiguration( workflowId: WorkflowIdentity, executionId: ExecutionIdentity, diff --git 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala index 3777b2b6216..756f468f46d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala @@ -118,13 +118,32 @@ class HashJoinOpDesc[K] extends LogicalOp { .withDerivePartition(_ => HashPartition(List(probeAttributeName))) .withParallelizable(true) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - PortIdentity() -> getOutputSchema( - Array(inputSchemas(PortIdentity(internal = true)), inputSchemas(PortIdentity(1))) - ) - ) - ) + SchemaPropagationFunc(inputSchemas => { + val buildSchema = inputSchemas(PortIdentity(internal = true)) + val probeSchema = inputSchemas(PortIdentity(1)) + val builder = Schema.builder() + builder.add(buildSchema) + builder.removeIfExists(HASH_JOIN_INTERNAL_KEY_NAME) + val leftAttributeNames = buildSchema.getAttributeNames + val rightAttributeNames = + probeSchema.getAttributeNames.filterNot(name => name == probeAttributeName) + + // Create a Map from rightTuple's fields, renaming conflicts + rightAttributeNames + .foreach { name => + var newName = name + while ( + leftAttributeNames.contains(newName) || rightAttributeNames + .filter(attrName => name != attrName) + .contains(newName) + ) { + newName = s"$newName#@1" + } + builder.add(new Attribute(newName, probeSchema.getAttribute(name).getType)) + } + val outputSchema = builder.build() + Map(PortIdentity() -> outputSchema) + }) ) PhysicalPlan( @@ -151,31 +170,4 @@ class HashJoinOpDesc[K] extends LogicalOp { ), outputPorts = List(OutputPort()) ) - - // remove the probe attribute in the output - override def getOutputSchema(schemas: Array[Schema]): Schema = { - val buildSchema = schemas(0) - val probeSchema = schemas(1) - val builder = Schema.builder() - builder.add(buildSchema) - builder.removeIfExists(HASH_JOIN_INTERNAL_KEY_NAME) - val leftAttributeNames = buildSchema.getAttributeNames - val rightAttributeNames = - probeSchema.getAttributeNames.filterNot(name => name == probeAttributeName) - - // Create a Map from rightTuple's fields, renaming conflicts - rightAttributeNames - .foreach { name => - var newName = name - while ( - leftAttributeNames.contains(newName) || rightAttributeNames - .filter(attrName => name != attrName) - .contains(newName) - ) { - newName = s"$newName#@1" - } - builder.add(new Attribute(newName, probeSchema.getAttribute(name).getType)) - } - builder.build() - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala index a4efb8226cb..dcef5abf438 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import 
edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class HuggingFaceIrisLogisticRegressionOpDesc extends PythonOperatorDescriptor { @JsonProperty(value = "petalLengthCmAttribute", required = true) @@ -90,17 +90,21 @@ class HuggingFaceIrisLogisticRegressionOpDesc extends PythonOperatorDescriptor { outputPorts = List(OutputPort()) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { if ( predictionClassName == null || predictionClassName.trim.isEmpty || predictionProbabilityName == null || predictionProbabilityName.trim.isEmpty ) throw new RuntimeException("Result attribute name should not be empty") - Schema - .builder() - .add(schemas(0)) - .add(predictionClassName, AttributeType.STRING) - .add(predictionProbabilityName, AttributeType.DOUBLE) - .build() + Map( + operatorInfo.outputPorts.head.id -> Schema + .builder() + .add(inputSchemas(operatorInfo.inputPorts.head.id)) + .add(predictionClassName, AttributeType.STRING) + .add(predictionProbabilityName, AttributeType.DOUBLE) + .build() + ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala index 04a603ed85c..5e9027951a9 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala @@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.huggingFace import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -77,19 +77,23 @@ class HuggingFaceSentimentAnalysisOpDesc extends PythonOperatorDescriptor { supportReconfiguration = true ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { if ( resultAttributePositive == null || resultAttributePositive.trim.isEmpty || resultAttributeNeutral == null || resultAttributeNeutral.trim.isEmpty || resultAttributeNegative == null || resultAttributeNegative.trim.isEmpty ) return null - Schema - .builder() - .add(schemas(0)) - .add(resultAttributePositive, AttributeType.DOUBLE) - .add(resultAttributeNeutral, AttributeType.DOUBLE) - .add(resultAttributeNegative, AttributeType.DOUBLE) - .build() + Map( + operatorInfo.outputPorts.head.id -> Schema + .builder() + .add(inputSchemas(operatorInfo.inputPorts.head.id)) + .add(resultAttributePositive, AttributeType.DOUBLE) + .add(resultAttributeNeutral, AttributeType.DOUBLE) + .add(resultAttributeNegative, AttributeType.DOUBLE) + .build() + ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala index cf1c43dd701..4257c17a6d5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class HuggingFaceSpamSMSDetectionOpDesc extends PythonOperatorDescriptor { @JsonProperty(value = "attribute", required = true) @JsonPropertyDescription("column to perform spam detection on") @@ -54,12 +54,16 @@ class HuggingFaceSpamSMSDetectionOpDesc extends PythonOperatorDescriptor { outputPorts = List(OutputPort()) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema - .builder() - .add(schemas(0)) - .add(resultAttributeSpam, AttributeType.BOOLEAN) - .add(resultAttributeProbability, AttributeType.DOUBLE) - .build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + Map( + operatorInfo.outputPorts.head.id -> Schema + .builder() + .add(inputSchemas.values.head) + .add(resultAttributeSpam, AttributeType.BOOLEAN) + .add(resultAttributeProbability, AttributeType.DOUBLE) + .build() + ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala index 349842369fb..e79369fb959 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala @@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.huggingFace import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} @@ -57,13 +57,17 @@ class HuggingFaceTextSummarizationOpDesc extends PythonOperatorDescriptor { outputPorts = List(OutputPort()) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { if (resultAttribute == null || resultAttribute.trim.isEmpty) throw new RuntimeException("Result attribute name should be given") - Schema - .builder() - .add(schemas(0)) - .add(resultAttribute, AttributeType.STRING) - .build() + Map( + operatorInfo.outputPorts.head.id -> Schema + .builder() + .add(inputSchemas.values.head) + .add(resultAttribute, AttributeType.STRING) + .build() + ) } } diff --git 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala index 1de8534ac11..48cc74ea8a7 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intersect/IntersectOpDesc.scala @@ -1,13 +1,10 @@ package edu.uci.ics.amber.operator.intersect -import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class IntersectOpDesc extends LogicalOp { @@ -36,10 +33,4 @@ class IntersectOpDesc extends LogicalOp { inputPorts = List(InputPort(PortIdentity()), InputPort(PortIdentity(1))), outputPorts = List(OutputPort(blocking = true)) ) - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.forall(_ == schemas(0))) - schemas(0) - } - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala index 985b8f9c4d6..764a42b2708 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala @@ -91,16 +91,22 @@ class IntervalJoinOpDesc extends LogicalOp { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - Array( - inputSchemas(operatorInfo.inputPorts.head.id), - inputSchemas(operatorInfo.inputPorts.last.id) - ) - ) - ) - ) + SchemaPropagationFunc(inputSchemas => { + val builder: Schema.Builder = Schema.builder() + val leftTableSchema: Schema = inputSchemas(operatorInfo.inputPorts.head.id) + val rightTableSchema: Schema = inputSchemas(operatorInfo.inputPorts.last.id) + builder.add(leftTableSchema) + rightTableSchema.getAttributes + .map(attr => { + if (leftTableSchema.containsAttribute(attr.getName)) { + builder.add(new Attribute(s"${attr.getName}#@1", attr.getType)) + } else { + builder.add(attr.getName, attr.getType) + } + }) + val outputSchema = builder.build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) + }) ) .withPartitionRequirement(partitionRequirement) } @@ -138,20 +144,4 @@ class IntervalJoinOpDesc extends LogicalOp { this.timeIntervalType = Some(timeIntervalType) } - override def getOutputSchema(schemas: Array[Schema]): Schema = { - val builder: Schema.Builder = Schema.builder() - val leftTableSchema: Schema = schemas(0) - val rightTableSchema: Schema = schemas(1) - builder.add(leftTableSchema) - rightTableSchema.getAttributes - .map(attr => { - if (leftTableSchema.containsAttribute(attr.getName)) { - builder.add(new 
Attribute(s"${attr.getName}#@1", attr.getType)) - } else { - builder.add(attr.getName, attr.getType) - } - }) - builder.build() - } - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala index 70ebe4725f4..ae0d7768bb1 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/limit/LimitOpDesc.scala @@ -3,7 +3,6 @@ package edu.uci.ics.amber.operator.limit import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.{LogicalOp, StateTransferFunc} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} @@ -49,8 +48,6 @@ class LimitOpDesc extends LogicalOp { supportReconfiguration = true ) - override def getOutputSchema(schemas: Array[Schema]): Schema = schemas(0) - override def runtimeReconfiguration( workflowId: WorkflowIdentity, executionId: ExecutionIdentity, diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala index 8183cf14e4c..62ca41b34eb 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala @@ -10,7 +10,7 @@ import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.{AutofillAttributeName, HideAnnotation} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class MachineLearningScorerOpDesc extends PythonOperatorDescriptor { @JsonProperty(required = true, defaultValue = "false") @@ -64,7 +64,9 @@ class MachineLearningScorerOpDesc extends PythonOperatorDescriptor { inputPorts = List(InputPort()), outputPorts = List(OutputPort()) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { val outputSchemaBuilder = Schema.builder() if (!isRegression) { outputSchemaBuilder.add(new Attribute("Class", AttributeType.STRING)) @@ -79,7 +81,7 @@ class MachineLearningScorerOpDesc extends PythonOperatorDescriptor { outputSchemaBuilder.add(new Attribute(metricName, AttributeType.DOUBLE)) }) - outputSchemaBuilder.build() + Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) } // private def getClassificationScorerName(scorer: classificationMetricsFnc): String = { diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala index 66467291eb0..0d35b6cbc85 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala @@ -149,9 +149,13 @@ abstract class SklearnMLOperatorDescriptor[T <: ParamClass] extends PythonOperat ) } - override def getOutputSchema(schemas: Array[Schema]): Schema = { + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { val outputSchemaBuilder = Schema.builder() outputSchemaBuilder.add(new Attribute("Model", AttributeType.BINARY)) - outputSchemaBuilder.add(new Attribute("Parameters", AttributeType.STRING)).build() + outputSchemaBuilder.add(new Attribute("Parameters", AttributeType.STRING)) + + Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/map/MapOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/map/MapOpDesc.scala index 5cce0ad9fb3..f47aca589be 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/map/MapOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/map/MapOpDesc.scala @@ -4,7 +4,7 @@ import edu.uci.ics.amber.core.workflow.PhysicalOp import edu.uci.ics.amber.operator.{LogicalOp, StateTransferFunc} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import scala.util.{Failure, Success, Try} +import scala.util.{Success, Try} abstract class MapOpDesc extends LogicalOp { @@ -14,22 +14,6 @@ abstract class MapOpDesc extends LogicalOp { oldOpDesc: LogicalOp, newOpDesc: LogicalOp ): Try[(PhysicalOp, Option[StateTransferFunc])] = { - val inputSchemas = oldOpDesc.operatorInfo.inputPorts - .map(inputPort => oldOpDesc.inputPortToSchemaMapping(inputPort.id)) - .toArray - val outputSchemas = oldOpDesc.operatorInfo.outputPorts - .map(outputPort => oldOpDesc.outputPortToSchemaMapping(outputPort.id)) - .toArray - val newOutputSchema = newOpDesc.getOutputSchema(inputSchemas) - if (!newOutputSchema.equals(outputSchemas.head)) { - Failure( - new UnsupportedOperationException( - "reconfigurations that change output schema are not supported" - ) - ) - } else { - Success(newOpDesc.getPhysicalOp(workflowId, executionId), None) - } + Success(newOpDesc.getPhysicalOp(workflowId, executionId), None) } - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala index 47b80cfaef0..39183a07ea5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala @@ -39,10 +39,26 @@ class ProjectionOpDesc extends MapOpDesc { .withOutputPorts(operatorInfo.outputPorts) .withDerivePartition(derivePartition()) .withPropagateSchema(SchemaPropagationFunc(inputSchemas => { + Preconditions.checkArgument(attributes.nonEmpty) + val inputSchema = inputSchemas.values.head + val outputSchema = if (!isDrop) { + Schema + .builder() + .add(attributes.map { attribute => + val originalType = 
inputSchema.getAttribute(attribute.getOriginalAttribute).getType
+                new Attribute(attribute.getAlias, originalType)
+              })
+              .build()
+          } else {
+            val outputSchemaBuilder = Schema.builder()
+            outputSchemaBuilder.add(inputSchema)
+            for (attribute <- attributes) {
+              outputSchemaBuilder.removeIfExists(attribute.getOriginalAttribute)
+            }
+            outputSchemaBuilder.build()
+          }
           Map(
-            operatorInfo.outputPorts.head.id -> getOutputSchema(
-              Array(inputSchemas(operatorInfo.inputPorts.head.id))
-            )
+            operatorInfo.outputPorts.head.id -> outputSchema
           )
         }))
   }
@@ -71,28 +87,4 @@ class ProjectionOpDesc extends MapOpDesc {
       outputPorts = List(OutputPort())
     )
   }
-
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    Preconditions.checkArgument(schemas.length == 1)
-    Preconditions.checkArgument(attributes.nonEmpty)
-    if (!isDrop) {
-      Schema
-        .builder()
-        .add(attributes.map { attribute =>
-          val originalType = schemas.head.getAttribute(attribute.getOriginalAttribute).getType
-          new Attribute(attribute.getAlias, originalType)
-        })
-        .build()
-    } else {
-      val outputSchemaBuilder = Schema.builder()
-      val inputSchema = schemas(0)
-      outputSchemaBuilder.add(inputSchema)
-      for (attribute <- attributes) {
-        outputSchemaBuilder.removeIfExists(attribute.getOriginalAttribute)
-      }
-      outputSchemaBuilder.build()
-
-    }
-
-  }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala
index cc1840609bf..cc65876a2f5 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/reservoirsampling/ReservoirSamplingOpDesc.scala
@@ -1,9 +1,7 @@
 package edu.uci.ics.amber.operator.reservoirsampling
 
 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
-import com.google.common.base.Preconditions
 import edu.uci.ics.amber.core.executor.OpExecWithClassName
-import edu.uci.ics.amber.core.tuple.Schema
 import edu.uci.ics.amber.core.workflow.PhysicalOp
 import edu.uci.ics.amber.operator.LogicalOp
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -44,9 +42,4 @@ class ReservoirSamplingOpDesc extends LogicalOp {
       outputPorts = List(OutputPort())
     )
   }
-
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    Preconditions.checkArgument(schemas.length == 1)
-    schemas(0)
-  }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala
index 815b08bdb2a..155380851ba 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala
@@ -55,9 +55,20 @@ class SentimentAnalysisOpDesc extends MapOpDesc {
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withPropagateSchema(
-        SchemaPropagationFunc(inputSchemas =>
-          Map(operatorInfo.outputPorts.head.id -> getOutputSchema(inputSchemas.values.toArray))
-        )
+        SchemaPropagationFunc(inputSchemas => {
+          // Avoid `return` here: inside a lambda it is a non-local return from
+          // getPhysicalOp and fails at propagation time. Yield null instead,
+          // matching the old getOutputSchema behavior.
+          val outputSchema =
+            if (resultAttribute == null || resultAttribute.trim.isEmpty) null
+            else
+              Schema
+                .builder()
+                .add(inputSchemas.values.head)
+                .add(resultAttribute, AttributeType.INTEGER)
+                .build()
+          Map(operatorInfo.outputPorts.head.id -> outputSchema)
+        })
       )
   }
 
@@ -71,9 +82,4 @@ class SentimentAnalysisOpDesc extends MapOpDesc {
       supportReconfiguration = true
     )
 
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    if (resultAttribute == null || resultAttribute.trim.isEmpty)
-      return null
-    Schema.builder().add(schemas(0)).add(resultAttribute, AttributeType.INTEGER).build()
-  }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala
index 09881901612..2279f4126dd 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala
@@ -106,11 +106,15 @@ abstract class SklearnClassifierOpDesc extends PythonOperatorDescriptor {
       outputPorts = List(OutputPort(blocking = true))
     )
 
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    Schema
-      .builder()
-      .add("model_name", AttributeType.STRING)
-      .add("model", AttributeType.BINARY)
-      .build()
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
+    Map(
+      operatorInfo.outputPorts.head.id -> Schema
+        .builder()
+        .add("model_name", AttributeType.STRING)
+        .add("model", AttributeType.BINARY)
+        .build()
+    )
   }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala
index a55cb953395..35e0e7d4d9d 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala
@@ -59,12 +59,16 @@ class SklearnLinearRegressionOpDesc extends PythonOperatorDescriptor {
       outputPorts = List(OutputPort(blocking = true))
     )
 
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    Schema
-      .builder()
-      .add("model_name", AttributeType.STRING)
-      .add("model", AttributeType.BINARY)
-      .build()
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
+    Map(
+      operatorInfo.outputPorts.head.id -> Schema
+        .builder()
+        .add("model_name", AttributeType.STRING)
+        .add("model", AttributeType.BINARY)
+        .build()
+    )
   }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala
index a1d4c86eb7e..6e3c8ae5cd7 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala
@@ -58,16 +58,21 @@ class SklearnPredictionOpDesc extends PythonOperatorDescriptor {
       outputPorts = List(OutputPort())
     )
 
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
+  override def getOutputSchemas(
+      inputSchemas: Map[PortIdentity, Schema]
+  ): Map[PortIdentity, Schema] = {
     var resultType = AttributeType.STRING
+    val inputSchema = inputSchemas(operatorInfo.inputPorts(1).id)
     if (groundTruthAttribute != "") {
       resultType =
-
schemas(1).attributes.find(attr => attr.getName == groundTruthAttribute).get.getType + inputSchema.attributes.find(attr => attr.getName == groundTruthAttribute).get.getType } - Schema - .builder() - .add(schemas(1)) - .add(resultAttribute, resultType) - .build() + Map( + operatorInfo.outputPorts.head.id -> Schema + .builder() + .add(inputSchema) + .add(resultAttribute, resultType) + .build() + ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sort/SortOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sort/SortOpDesc.scala index 39af6cd63a9..644ff0ff6cd 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sort/SortOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sort/SortOpDesc.scala @@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.sort import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} class SortOpDesc extends PythonOperatorDescriptor { @@ -40,6 +40,10 @@ class SortOpDesc extends PythonOperatorDescriptor { | yield sorted_df""".stripMargin } + def getOutputSchemas(inputSchemas: Map[PortIdentity, Schema]): Map[PortIdentity, Schema] = { + Map(operatorInfo.outputPorts.head.id -> inputSchemas.values.head) + } + override def operatorInfo: OperatorInfo = OperatorInfo( "Sort", @@ -49,5 +53,4 @@ class SortOpDesc extends PythonOperatorDescriptor { outputPorts = List(OutputPort(blocking = true)) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = schemas(0) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala index 6c06d95dadc..3d4809e34f5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpDesc.scala @@ -1,10 +1,8 @@ package edu.uci.ics.amber.operator.sortPartitions import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import com.google.common.base.Preconditions import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.{PhysicalOp, RangePartition} import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -68,9 +66,4 @@ class SortPartitionsOpDesc extends LogicalOp { inputPorts = List(InputPort()), outputPorts = List(OutputPort(blocking = true)) ) - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.length == 1) - schemas(0) - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala index ad36af84dbb..de87829cd38 100644 --- 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/SourceOperatorDescriptor.scala @@ -1,15 +1,9 @@ package edu.uci.ics.amber.operator.source -import com.google.common.base.Preconditions import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.operator.LogicalOp abstract class SourceOperatorDescriptor extends LogicalOp { def sourceSchema(): Schema - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.isEmpty) - sourceSchema() - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala index bf2c9336d83..6213cc26ded 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala @@ -5,7 +5,7 @@ import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.source.PythonSourceOperatorDescriptor -import edu.uci.ics.amber.core.workflow.OutputPort +import edu.uci.ics.amber.core.workflow.{OutputPort, PortIdentity} class RedditSearchSourceOpDesc extends PythonSourceOperatorDescriptor { @JsonProperty(required = true) @@ -134,4 +134,8 @@ class RedditSearchSourceOpDesc extends PythonSourceOperatorDescriptor { new Attribute("subreddit", AttributeType.STRING) ) .build() + + def getOutputSchemas(inputSchemas: Map[PortIdentity, Schema]): Map[PortIdentity, Schema] = { + Map(operatorInfo.outputPorts.head.id -> sourceSchema()) + } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala index 134af1029cd..217bb161b2f 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/split/SplitOpDesc.scala @@ -3,13 +3,11 @@ package edu.uci.ics.amber.operator.split import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.util.JSONUtils.objectMapper -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import scala.util.Random class SplitOpDesc extends LogicalOp { @@ -40,12 +38,11 @@ class SplitOpDesc extends LogicalOp { .withOutputPorts(operatorInfo.outputPorts) .withParallelizable(false) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - operatorInfo.outputPorts - .map(_.id) - .map(id 
=> id -> inputSchemas(operatorInfo.inputPorts.head.id)) - .toMap - ) + SchemaPropagationFunc(inputSchemas => { + Preconditions.checkArgument(inputSchemas.size == 1) + val outputSchema = inputSchemas.values.head + operatorInfo.outputPorts.map(port => port.id -> outputSchema).toMap + }) ) } @@ -64,10 +61,4 @@ class SplitOpDesc extends LogicalOp { ) } - override def getOutputSchema(schemas: Array[Schema]): Schema = throw new NotImplementedError() - - override def getOutputSchemas(schemas: Array[Schema]): Array[Schema] = { - Preconditions.checkArgument(schemas.length == 1) - Array(schemas(0), schemas(0)) - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala index e77663fdf0b..3dc311f77d5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpDesc.scala @@ -2,12 +2,17 @@ package edu.uci.ics.amber.operator.symmetricDifference import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.{HashPartition, PhysicalOp} +import edu.uci.ics.amber.core.workflow.{ + HashPartition, + InputPort, + OutputPort, + PhysicalOp, + PortIdentity, + SchemaPropagationFunc +} import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class SymmetricDifferenceOpDesc extends LogicalOp { @@ -29,6 +34,11 @@ class SymmetricDifferenceOpDesc extends LogicalOp { .withOutputPorts(operatorInfo.outputPorts) .withPartitionRequirement(List(Option(HashPartition()), Option(HashPartition()))) .withDerivePartition(_ => HashPartition(List())) + .withPropagateSchema(SchemaPropagationFunc(inputSchemas => { + Preconditions.checkArgument(inputSchemas.values.toSet.size == 1) + val outputSchema = inputSchemas.values.head + operatorInfo.outputPorts.map(port => port.id -> outputSchema).toMap + })) } override def operatorInfo: OperatorInfo = @@ -40,9 +50,4 @@ class SymmetricDifferenceOpDesc extends LogicalOp { outputPorts = List(OutputPort(blocking = true)) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.forall(_ == schemas(0))) - schemas(0) - } - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala index b52f299c0ff..f5d502437b4 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpDesc.scala @@ -54,10 +54,4 @@ class TypeCastingOpDesc extends MapOpDesc { List(OutputPort()) ) } - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - typeCastingUnits.foldLeft(schemas.head) { (schema, unit) => - AttributeTypeUtils.SchemaCasting(schema, unit.attribute, unit.resultType) - } - } } diff --git 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala index 9fe0089c4ba..fd38d176ae1 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala @@ -157,26 +157,6 @@ class JavaUDFOpDesc extends LogicalOp { ) } - override def getOutputSchema(schemas: Array[Schema]): Schema = { - // Preconditions.checkArgument(schemas.length == 1) - val inputSchema = schemas(0) - val outputSchemaBuilder = Schema.Builder() - // keep the same schema from input - if (retainInputColumns) outputSchemaBuilder.add(inputSchema) - // for any javaUDFType, it can add custom output columns (attributes). - if (outputColumns != null) { - if (retainInputColumns) { // check if columns are duplicated - - for (column <- outputColumns) { - if (inputSchema.containsAttribute(column.getName)) - throw new RuntimeException("Column name " + column.getName + " already exists!") - } - } - outputSchemaBuilder.add(outputColumns) - } - outputSchemaBuilder.build() - } - override def runtimeReconfiguration( workflowId: WorkflowIdentity, executionId: ExecutionIdentity, diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala index 985fa54fede..a4af16c5415 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala @@ -64,7 +64,7 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp { executionId: ExecutionIdentity ): PhysicalOp = { Preconditions.checkArgument(workers >= 1, "Need at least 1 worker.", Array()) - if (workers > 1) { + val physicalOp = if (workers > 1) { PhysicalOp .oneToOnePhysicalOp( workflowId, @@ -72,15 +72,7 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp { operatorIdentifier, OpExecWithCode(code, "python") ) - .withDerivePartition(_ => UnknownPartition()) .withParallelizable(true) - .withInputPorts(operatorInfo.inputPorts) - .withOutputPorts(operatorInfo.outputPorts) - .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map(operatorInfo.outputPorts.head.id -> getOutputSchema(inputSchemas.values.toArray)) - ) - ) .withSuggestedWorkerNum(workers) } else { PhysicalOp @@ -90,20 +82,33 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp { operatorIdentifier, OpExecWithCode(code, "python") ) - .withDerivePartition(_ => UnknownPartition()) .withParallelizable(false) - .withInputPorts(operatorInfo.inputPorts) - .withOutputPorts(operatorInfo.outputPorts) - .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - operatorInfo.inputPorts.map(_.id).map(inputSchemas(_)).toArray - ) - ) - ) - ) } + physicalOp + .withDerivePartition(_ => UnknownPartition()) + .withInputPorts(operatorInfo.inputPorts) + .withOutputPorts(operatorInfo.outputPorts) + .withPropagateSchema( + SchemaPropagationFunc(inputSchemas => { + Preconditions.checkArgument(inputSchemas.size == 2) + val inputSchema = inputSchemas(operatorInfo.inputPorts(1).id) + val outputSchemaBuilder = Schema.builder() + // keep the same 
schema from input + if (retainInputColumns) outputSchemaBuilder.add(inputSchema) + // for any pythonUDFType, it can add custom output columns (attributes). + if (outputColumns != null) { + if (retainInputColumns) { // check if columns are duplicated + + for (column <- outputColumns) { + if (inputSchema.containsAttribute(column.getName)) + throw new RuntimeException("Column name " + column.getName + " already exists!") + } + } + outputSchemaBuilder.add(outputColumns).build() + } + Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + }) + ) } override def operatorInfo: OperatorInfo = @@ -123,23 +128,4 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp { outputPorts = List(OutputPort()) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.length == 2) - val inputSchema = schemas(1) - val outputSchemaBuilder = Schema.builder() - // keep the same schema from input - if (retainInputColumns) outputSchemaBuilder.add(inputSchema) - // for any pythonUDFType, it can add custom output columns (attributes). - if (outputColumns != null) { - if (retainInputColumns) { // check if columns are duplicated - - for (column <- outputColumns) { - if (inputSchema.containsAttribute(column.getName)) - throw new RuntimeException("Column name " + column.getName + " already exists!") - } - } - outputSchemaBuilder.add(outputColumns).build() - } - outputSchemaBuilder.build() - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala index aa016c2740e..056326c8093 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala @@ -5,16 +5,18 @@ import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{AttributeTypeUtils, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class PythonLambdaFunctionOpDesc extends PythonOperatorDescriptor { @JsonSchemaTitle("Add/Modify column(s)") var lambdaAttributeUnits: List[LambdaAttributeUnit] = List() - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.length == 1) + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + Preconditions.checkArgument(inputSchemas.size == 1) Preconditions.checkArgument(lambdaAttributeUnits.nonEmpty) - val inputSchema = schemas(0) + val inputSchema = inputSchemas.values.head val outputSchemaBuilder = Schema.builder() // keep the same schema from input outputSchemaBuilder.add(inputSchema) @@ -37,7 +39,8 @@ class PythonLambdaFunctionOpDesc extends PythonOperatorDescriptor { outputSchema = AttributeTypeUtils.SchemaCasting(outputSchema, unit.attributeName, unit.attributeType) } - outputSchema + Map(operatorInfo.outputPorts.head.id -> outputSchema) + } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala index aa36eaced06..0f6d1988de5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala @@ -3,21 +3,23 @@ package edu.uci.ics.amber.operator.udf.python import com.google.common.base.Preconditions import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.Schema +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} class PythonTableReducerOpDesc extends PythonOperatorDescriptor { @JsonSchemaTitle("Output columns") var lambdaAttributeUnits: List[LambdaAttributeUnit] = List() - override def getOutputSchema(schemas: Array[Schema]): Schema = { + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { Preconditions.checkArgument(lambdaAttributeUnits.nonEmpty) val outputSchemaBuilder = Schema.builder() for (unit <- lambdaAttributeUnits) { outputSchemaBuilder.add(unit.attributeName, unit.attributeType) } - outputSchemaBuilder.build() + Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) } override def operatorInfo: OperatorInfo = @@ -30,17 +32,17 @@ class PythonTableReducerOpDesc extends PythonOperatorDescriptor { ) override def generatePythonCode(): String = { - var outputTable = "{" - for (unit <- lambdaAttributeUnits) { - outputTable += s"""\"${unit.attributeName}\":${unit.expression},""" - } - outputTable += "}" + val outputTable = lambdaAttributeUnits + .map(unit => s"""\"${unit.attributeName}\": ${unit.expression}""") + .mkString("{", ", ", "}") + s""" -from pytexera import * -class ProcessTableOperator(UDFTableOperator): - @overrides - def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: - yield $outputTable -""" + |from pytexera import * + |class ProcessTableOperator(UDFTableOperator): + | + | @overrides + | def process_table(self, table: Table, port: int) -> Iterator[Optional[TableLike]]: + | yield $outputTable + |""".stripMargin } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala index 1f9b69eb326..216284315cb 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala @@ -5,16 +5,10 @@ import com.google.common.base.Preconditions import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.executor.OpExecWithCode import edu.uci.ics.amber.core.tuple.{Attribute, Schema} -import edu.uci.ics.amber.core.workflow.{ - PartitionInfo, - PhysicalOp, - SchemaPropagationFunc, - UnknownPartition -} -import edu.uci.ics.amber.operator.{LogicalOp, PortDescription, StateTransferFunc} -import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, 
OutputPort, PortIdentity}
+import edu.uci.ics.amber.core.workflow._
+import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
+import edu.uci.ics.amber.operator.{LogicalOp, PortDescription, StateTransferFunc}
 
 import scala.util.{Success, Try}
 
@@ -98,7 +92,7 @@ class PythonUDFOpDescV2 extends LogicalOp {
       Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build())
     }
 
-    if (workers > 1)
+    val physicalOp = if (workers > 1) {
       PhysicalOp
         .oneToOnePhysicalOp(
           workflowId,
@@ -106,15 +100,9 @@ class PythonUDFOpDescV2 extends LogicalOp {
           operatorIdentifier,
           OpExecWithCode(code, "python")
         )
-        .withDerivePartition(_ => UnknownPartition())
-        .withInputPorts(operatorInfo.inputPorts)
-        .withOutputPorts(operatorInfo.outputPorts)
-        .withPartitionRequirement(partitionRequirement)
-        .withIsOneToManyOp(true)
         .withParallelizable(true)
         .withSuggestedWorkerNum(workers)
-        .withPropagateSchema(SchemaPropagationFunc(propagateSchema))
-    else
+    } else {
       PhysicalOp
         .manyToOnePhysicalOp(
           workflowId,
@@ -122,13 +110,17 @@ class PythonUDFOpDescV2 extends LogicalOp {
           operatorIdentifier,
           OpExecWithCode(code, "python")
         )
-        .withDerivePartition(_ => UnknownPartition())
-        .withInputPorts(operatorInfo.inputPorts)
-        .withOutputPorts(operatorInfo.outputPorts)
-        .withPartitionRequirement(partitionRequirement)
-        .withIsOneToManyOp(true)
         .withParallelizable(false)
-        .withPropagateSchema(SchemaPropagationFunc(propagateSchema))
+    }
+    // apply the shared configuration once to the selected op; chaining it onto
+    // the else block would leave the workers > 1 branch unconfigured
+    physicalOp
+      .withDerivePartition(_ => UnknownPartition())
+      .withInputPorts(operatorInfo.inputPorts)
+      .withOutputPorts(operatorInfo.outputPorts)
+      .withPartitionRequirement(partitionRequirement)
+      .withIsOneToManyOp(true)
+      .withPropagateSchema(SchemaPropagationFunc(propagateSchema))
   }
 
   override def operatorInfo: OperatorInfo = {
@@ -165,27 +157,6 @@ class PythonUDFOpDescV2 extends LogicalOp {
       allowPortCustomization = true
     )
   }
-
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    // Preconditions.checkArgument(schemas.length == 1)
-    val inputSchema = schemas(0)
-    val outputSchemaBuilder = Schema.builder()
-    // keep the same schema from input
-    if (retainInputColumns) outputSchemaBuilder.add(inputSchema)
-    // for any pythonUDFType, it can add custom output columns (attributes).
-    if (outputColumns != null) {
-      if (retainInputColumns) { // check if columns are duplicated
-
-        for (column <- outputColumns) {
-          if (inputSchema.containsAttribute(column.getName))
-            throw new RuntimeException("Column name " + column.getName + " already exists!")
-        }
-      }
-      outputSchemaBuilder.add(outputColumns).build()
-    }
-    outputSchemaBuilder.build()
-  }
-
   override def runtimeReconfiguration(
     workflowId: WorkflowIdentity,
     executionId: ExecutionIdentity,
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala
index 086b014ea68..a219ba2808a 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala
@@ -8,7 +8,7 @@ import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.source.SourceOperatorDescriptor
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
-import edu.uci.ics.amber.core.workflow.{OutputPort, PortIdentity}
+import edu.uci.ics.amber.core.workflow.OutputPort
 
 class PythonUDFSourceOpDescV2 extends SourceOperatorDescriptor {
 
@@ -41,18 +41,14 @@ class PythonUDFSourceOpDescV2 extends SourceOperatorDescriptor {
     executionId: ExecutionIdentity
   ): PhysicalOp = {
     require(workers >= 1, "Need at least 1 worker.")
-
-    val func = SchemaPropagationFunc { _: Map[PortIdentity, Schema] =>
-      val outputSchema = sourceSchema()
-      Map(operatorInfo.outputPorts.head.id -> outputSchema)
-    }
-
     val physicalOp = PhysicalOp
       .sourcePhysicalOp(workflowId, executionId, operatorIdentifier, OpExecWithCode(code, "python"))
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withIsOneToManyOp(true)
-      .withPropagateSchema(func)
+      .withPropagateSchema(
+        SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
+      )
       .withLocationPreference(Option.empty)
 
     if (workers > 1) {
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
index 42445e21e16..bc9d6ec1b5e 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
@@ -91,7 +91,7 @@ class RUDFOpDesc extends LogicalOp {
     }
 
     val r_operator_type = if (useTupleAPI) "r-tuple" else "r-table"
-    if (workers > 1)
+    val physicalOp = if (workers > 1) {
       PhysicalOp
         .oneToOnePhysicalOp(
           workflowId,
@@ -99,15 +99,9 @@ class RUDFOpDesc extends LogicalOp {
           operatorIdentifier,
           OpExecWithCode(code, r_operator_type)
         )
-        .withDerivePartition(_ => UnknownPartition())
-        .withInputPorts(operatorInfo.inputPorts)
-        .withOutputPorts(operatorInfo.outputPorts)
-        .withPartitionRequirement(partitionRequirement)
-        .withIsOneToManyOp(true)
         .withParallelizable(true)
         .withSuggestedWorkerNum(workers)
-        .withPropagateSchema(SchemaPropagationFunc(propagateSchema))
-    else
+    } else {
       PhysicalOp
         .manyToOnePhysicalOp(
           workflowId,
@@ -115,13 +109,17 @@ class RUDFOpDesc extends LogicalOp {
           operatorIdentifier,
           OpExecWithCode(code, r_operator_type)
         )
-        .withDerivePartition(_ => UnknownPartition())
-        .withInputPorts(operatorInfo.inputPorts)
-        .withOutputPorts(operatorInfo.outputPorts)
-        .withPartitionRequirement(partitionRequirement)
-        .withIsOneToManyOp(true)
         .withParallelizable(false)
-        .withPropagateSchema(SchemaPropagationFunc(propagateSchema))
+    }
+    // apply the shared configuration once to whichever branch was chosen
+    physicalOp
+      .withDerivePartition(_ => UnknownPartition())
+      .withInputPorts(operatorInfo.inputPorts)
+      .withOutputPorts(operatorInfo.outputPorts)
+      .withPartitionRequirement(partitionRequirement)
+      .withIsOneToManyOp(true)
+      .withPropagateSchema(SchemaPropagationFunc(propagateSchema))
+  }
 
   override def operatorInfo: OperatorInfo = {
@@ -151,32 +149,10 @@ class RUDFOpDesc extends LogicalOp {
       "User-defined function operator in R script",
       OperatorGroupConstants.R_GROUP,
       inputPortInfo,
-      outputPortInfo,
-      dynamicInputPorts = false,
-      dynamicOutputPorts = false,
-      supportReconfiguration = false,
-      allowPortCustomization = false
+      outputPortInfo
     )
   }
 
-  override def getOutputSchema(schemas: Array[Schema]): Schema = {
-    val inputSchema = schemas(0)
-    val outputSchemaBuilder = Schema.Builder()
-    // keep the same schema from input
-    if (retainInputColumns) outputSchemaBuilder.add(inputSchema)
-    if (outputColumns != null) {
-      if (retainInputColumns) { // check if columns are duplicated
-
-        for (column <- outputColumns) {
-          if (inputSchema.containsAttribute(column.getName))
-            throw new RuntimeException("Column name " + column.getName + " already exists!")
-        }
-      }
-      outputSchemaBuilder.add(outputColumns)
-    }
-    outputSchemaBuilder.build()
-  }
-
   override def runtimeReconfiguration(
     workflowId: WorkflowIdentity,
     executionId: ExecutionIdentity,
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
index 0653228a145..19f65d42c0d 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
@@ -8,7 +8,7 @@ import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.source.SourceOperatorDescriptor
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
-import edu.uci.ics.amber.core.workflow.{OutputPort, PortIdentity}
+import edu.uci.ics.amber.core.workflow.OutputPort
 
 class RUDFSourceOpDesc extends SourceOperatorDescriptor {
 
@@ -51,11 +51,6 @@ class RUDFSourceOpDesc extends SourceOperatorDescriptor {
     val rOperatorType = if (useTupleAPI) "r-tuple" else "r-table"
     require(workers >= 1, "Need at least 1 worker.")
 
-    val func = SchemaPropagationFunc { _: Map[PortIdentity, Schema] =>
-      val outputSchema = sourceSchema()
-      Map(operatorInfo.outputPorts.head.id -> outputSchema)
-    }
-
     val physicalOp = PhysicalOp
       .sourcePhysicalOp(
         workflowId,
@@ -66,7 +61,9 @@ class RUDFSourceOpDesc extends SourceOperatorDescriptor {
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withIsOneToManyOp(true)
-      .withPropagateSchema(func)
+      .withPropagateSchema(
+        SchemaPropagationFunc(_ => Map(operatorInfo.outputPorts.head.id -> sourceSchema()))
+      )
       .withLocationPreference(None)
 
     if (workers > 1) {
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala
index
7e75c24e7f6..6ee4fea20d9 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/union/UnionOpDesc.scala @@ -1,13 +1,10 @@ package edu.uci.ics.amber.operator.union -import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.Schema -import edu.uci.ics.amber.core.workflow.PhysicalOp +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, PortIdentity} import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class UnionOpDesc extends LogicalOp { @@ -34,10 +31,4 @@ class UnionOpDesc extends LogicalOp { inputPorts = List(InputPort(PortIdentity(0), allowMultiLinks = true)), outputPorts = List(OutputPort()) ) - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.forall(_ == schemas(0))) - schemas(0) - } - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala index 5ac736490da..2eb0fefa152 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala @@ -1,7 +1,6 @@ package edu.uci.ics.amber.operator.unneststring import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import com.google.common.base.Preconditions import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} @@ -53,19 +52,17 @@ class UnnestStringOpDesc extends FlatMapOpDesc { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - operatorInfo.inputPorts.map(_.id).map(inputSchemas(_)).toArray - ) - ) - ) + SchemaPropagationFunc(inputSchemas => { + val outputSchema = + if (resultAttribute == null || resultAttribute.trim.isEmpty) null + else + Schema + .builder() + .add(inputSchemas.values.head) + .add(resultAttribute, AttributeType.STRING) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) + }) ) } - - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Preconditions.checkArgument(schemas.forall(_ == schemas(0))) - if (resultAttribute == null || resultAttribute.trim.isEmpty) return null - Schema.builder().add(schemas(0)).add(resultAttribute, AttributeType.STRING).build() - } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala index c6e6a46d34a..ff082be7b3d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala +++ 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala @@ -5,7 +5,7 @@ import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -17,8 +17,14 @@ class DotPlotOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var countAttribute: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala index 7034e9f6bb9..16e682b4163 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala @@ -8,7 +8,7 @@ import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.visualization.hierarchychart.HierarchySection import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} // type constraint: value can only be numeric @JsonSchemaInject(json = """ @@ -34,8 +34,14 @@ class IcicleChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var value: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala index a1389c426d8..5e85d1979b2 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.{JsonProperty, 
JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.PythonOperatorDescriptor @@ -16,8 +16,14 @@ class ImageVisualizerOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var binaryContent: String = _ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala index ea84feee197..4b6a366d7c4 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala @@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.visualization.ScatterMatrixChart import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.{ AutofillAttributeName, @@ -34,8 +34,14 @@ class ScatterMatrixChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var color: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala index e3a8705cab0..c3924b3275d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala @@ -3,11 +3,11 @@ package 
edu.uci.ics.amber.operator.visualization.barChart import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.PythonOperatorDescriptor -import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName -import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} //type constraint: value can only be numeric @JsonSchemaInject(json = """ @@ -50,8 +50,14 @@ class BarChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var pattern: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala index 064992b96d5..5df97e865a1 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.PythonOperatorDescriptor @@ -37,8 +37,14 @@ class BoxPlotOpDesc extends PythonOperatorDescriptor { ) var quertiletype: BoxPlotQuartileFunction = _ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala index 
aa589a33c24..3a4db9d8e91 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala @@ -5,7 +5,7 @@ import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -43,8 +43,14 @@ class BubbleChartOpDesc extends PythonOperatorDescriptor { @JsonPropertyDescription("Picks data column to color bubbles with if color is enabled") @AutofillAttributeName var colorCategory: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala index a344e3fb6d6..80ee1ff31e1 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class CandlestickChartOpDesc extends PythonOperatorDescriptor { @@ -41,8 +41,14 @@ class CandlestickChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var close: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala index e81d1d87f78..78d818cc161 100644 --- 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala @@ -6,7 +6,7 @@ import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import java.util import scala.jdk.CollectionConverters.ListHasAsScala @@ -25,8 +25,14 @@ class ContinuousErrorBandsOpDesc extends PythonOperatorDescriptor { @JsonProperty(value = "bands", required = true) var bands: util.List[BandConfig] = _ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala index 14001e6d3ec..0a132c2c996 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class ContourPlotOpDesc extends PythonOperatorDescriptor { @@ -46,8 +46,14 @@ class ContourPlotOpDesc extends PythonOperatorDescriptor { ) var coloringMethod: ContourPlotColoringFunction = _ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala index eb933ce627d..bac0482bf8a 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala +++ 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -59,8 +59,14 @@ class DumbbellPlotOpDesc extends PythonOperatorDescriptor { @JsonPropertyDescription("whether show legends in the graph") var showLegends: Boolean = false; - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala index 6c3d93b6f2e..32c250b55dd 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala @@ -6,7 +6,7 @@ import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class FigureFactoryTableOpDesc extends PythonOperatorDescriptor { @JsonProperty(required = false) @@ -104,7 +104,13 @@ class FigureFactoryTableOpDesc extends PythonOperatorDescriptor { ) } - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala index 106f424bbc1..2e4e0691a08 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala +++ 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode @@ -46,8 +46,14 @@ class FilledAreaPlotOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var pattern: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala index c9a32bc8044..a7e8075edff 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} @JsonSchemaInject(json = """ { "attributeTypeRules": { @@ -35,8 +35,14 @@ class FunnelPlotOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var color: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala index 2a34113e9fb..382035b3d64 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala @@ -4,7 +4,7 @@ import 
com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode @@ -53,8 +53,14 @@ class GanttChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var pattern: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala index 8c11038d25e..3b623fbccc3 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class HeatMapOpDesc extends PythonOperatorDescriptor { @JsonProperty(value = "x", required = true) @@ -28,8 +28,14 @@ class HeatMapOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var value: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala index b23f7b511c2..3e09d51484c 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala @@ -3,11 +3,10 @@ package edu.uci.ics.amber.operator.visualization.hierarchychart import 
com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} - import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.PythonOperatorDescriptor // type constraint: value can only be numeric @JsonSchemaInject(json = """ @@ -38,8 +37,14 @@ class HierarchyChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var value: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala index 044dad14065..829f5355224 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.PythonOperatorDescriptor @@ -94,8 +94,14 @@ class HistogramChartOpDesc extends PythonOperatorDescriptor { finalCode } - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala index 2bf48a41ca8..5d84d7e548b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala @@ -4,14 +4,13 @@ 
import com.fasterxml.jackson.annotation.JsonProperty import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.util.JSONUtils.objectMapper import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} /** * HTML Visualization operator to render any given HTML code @@ -39,13 +38,13 @@ class HtmlVizOpDesc extends LogicalOp { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - operatorInfo.inputPorts.map(_.id).map(inputSchemas(_)).toArray - ) - ) - ) + SchemaPropagationFunc(inputSchemas => { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) + }) ) } @@ -58,6 +57,4 @@ class HtmlVizOpDesc extends LogicalOp { outputPorts = List(OutputPort(mode = OutputMode.SINGLE_SNAPSHOT)) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala index e7deebe579d..69eb7f83d12 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala @@ -6,7 +6,7 @@ import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import java.util import scala.jdk.CollectionConverters.ListHasAsScala @@ -26,8 +26,14 @@ class LineChartOpDesc extends PythonOperatorDescriptor { @JsonProperty(value = "lines", required = true) var lines: util.List[LineConfig] = _ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala index 923ca5a619a..5ff0bfa88ae 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala @@ -5,7 +5,7 @@ import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchema import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -33,8 +33,14 @@ class PieChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var name: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala index 054d02b8090..58f8759594b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala @@ -5,7 +5,7 @@ import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchema import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName @@ -42,8 +42,14 @@ class QuiverPlotOpDesc extends PythonOperatorDescriptor { @JsonPropertyDescription("column for the vector component in the y-direction") @AutofillAttributeName var v: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala index f00c164743b..ca8cff0cdcb 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class SankeyDiagramOpDesc extends PythonOperatorDescriptor { @@ -29,8 +29,14 @@ class SankeyDiagramOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var valueAttribute: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala index d0e71870398..2a42c2c84dc 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala @@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.visualization.scatter3DChart import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.PythonOperatorDescriptor @@ -34,8 +34,14 @@ class Scatter3dChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var z: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git 
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala index 8195441602b..d56a2c45d1d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala @@ -4,7 +4,7 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode @@ -60,8 +60,14 @@ class ScatterplotOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var hoverName: String = "" - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala index 941d681a6ed..87d174d01e9 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala @@ -5,7 +5,7 @@ import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class TablesPlotOpDesc extends PythonOperatorDescriptor { @JsonPropertyDescription("List of columns to include in the table chart") @@ -80,7 +80,13 @@ class TablesPlotOpDesc extends PythonOperatorDescriptor { ) } - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala index 91db1d5e1b2..2840ea421da 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} /** * Visualization Operator for Ternary Plots. @@ -57,9 +57,14 @@ class TernaryPlotOpDesc extends PythonOperatorDescriptor { outputPorts = List(OutputPort(mode = OutputMode.SINGLE_SNAPSHOT)) ) - /** Returns the output schema set as html-content */ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } /** Returns a Python string that drops any tuples with missing values */ diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala index 9df368b0ec9..90482deaaa5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala @@ -4,10 +4,9 @@ import com.fasterxml.jackson.annotation.JsonProperty import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.util.JSONUtils.objectMapper @@ -50,13 +49,13 @@ class UrlVizOpDesc extends LogicalOp { .withInputPorts(operatorInfo.inputPorts) .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( - SchemaPropagationFunc(inputSchemas => - Map( - operatorInfo.outputPorts.head.id -> getOutputSchema( - operatorInfo.inputPorts.map(_.id).map(inputSchemas(_)).toArray - ) - ) - ) + SchemaPropagationFunc(_ => { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) + }) ) } @@ -69,7 +68,4 @@ class UrlVizOpDesc extends LogicalOp { 
outputPorts = List(OutputPort(mode = OutputMode.SINGLE_SNAPSHOT)) ) - override def getOutputSchema(schemas: Array[Schema]): Schema = - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() - } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala index 15bee2d2506..2ba19165765 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala @@ -7,7 +7,7 @@ import edu.uci.ics.amber.operator.PythonOperatorDescriptor import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class WaterfallChartOpDesc extends PythonOperatorDescriptor { @@ -23,8 +23,14 @@ class WaterfallChartOpDesc extends PythonOperatorDescriptor { @AutofillAttributeName var yColumn: String = _ - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala index 516bc3ab3b4..e6e2c408e48 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala @@ -12,7 +12,7 @@ import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.visualization.ImageUtility import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity} class WordCloudOpDesc extends PythonOperatorDescriptor { @JsonProperty(required = true) @JsonSchemaTitle("Text column") @@ -24,8 +24,14 @@ class WordCloudOpDesc extends PythonOperatorDescriptor { @JsonSchemaInject(ints = Array(new JsonSchemaInt(path = "exclusiveMinimum", value = 0))) var topN: Integer = 100 - override def getOutputSchema(schemas: Array[Schema]): Schema = { - Schema.builder().add(new Attribute("html-content", AttributeType.STRING)).build() + override def getOutputSchemas( + inputSchemas: Map[PortIdentity, Schema] + ): Map[PortIdentity, Schema] = { + val outputSchema = Schema + .builder() + .add(new Attribute("html-content", AttributeType.STRING)) + .build() + Map(operatorInfo.outputPorts.head.id -> 
outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala index a913ff15576..60725d53295 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala @@ -8,6 +8,7 @@ import edu.uci.ics.amber.core.tuple.{ Tuple, TupleLike } +import edu.uci.ics.amber.core.workflow.PortIdentity import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -102,8 +103,8 @@ class CartesianProductOpExecSpec extends AnyFlatSpec with BeforeAndAfter { .add(generate_schema("right", numRightSchemaAttributes - 1)) .add(duplicateAttribute) .build() - val inputSchemas = Array(leftSchema, rightSchema) - val outputSchema = opDesc.getOutputSchema(inputSchemas) + val inputSchemas = Map(PortIdentity() -> leftSchema, PortIdentity(1) -> rightSchema) + val outputSchema = opDesc.getExternalOutputSchemas(inputSchemas).values.head // verify output schema is as expected & has no duplicates assert( diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala index f952d847e7f..1d19700e071 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.dictionary import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema, SchemaEnforceable, Tuple} +import edu.uci.ics.amber.core.workflow.PortIdentity import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -35,7 +36,7 @@ class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.dictionary = dictionaryScan opDesc.resultAttribute = "matched" opDesc.matchingType = MatchingType.SCANBASED - outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) + outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> tupleSchema)).values.head } it should "open" in { diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala index 2049d89f7f4..c3cd8f6ecd3 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala @@ -11,6 +11,7 @@ import edu.uci.ics.amber.core.tuple.{ Tuple, TupleLike } +import edu.uci.ics.amber.core.workflow.PortIdentity import edu.uci.ics.amber.operator.hashJoin.HashJoinBuildOpExec import edu.uci.ics.amber.util.JSONUtils.objectMapper class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { @@ -53,8 +54,8 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.buildAttributeName = "build_1" opDesc.probeAttributeName = "probe_1" opDesc.joinType = JoinType.INNER - val inputSchemas = Array(schema("build"), 
schema("probe")) - val outputSchema = opDesc.getOutputSchema(inputSchemas) + val inputSchemas = Map(PortIdentity() -> schema("build"), PortIdentity(1) -> schema("probe")) + val outputSchema = opDesc.getExternalOutputSchemas(inputSchemas).values.head buildOpExec = new HashJoinBuildOpExec[String](objectMapper.writeValueAsString(opDesc)) buildOpExec.open() @@ -79,7 +80,7 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { buildOpOutputIterator .next() .asInstanceOf[SchemaEnforceable] - .enforceSchema(getInternalHashTableSchema(inputSchemas.head)), + .enforceSchema(getInternalHashTableSchema(inputSchemas.head._2)), build ) .isEmpty @@ -109,8 +110,9 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.buildAttributeName = "same" opDesc.probeAttributeName = "same" opDesc.joinType = JoinType.INNER - val inputSchemas = Array(schema("same", 1), schema("same", 2)) - val outputSchema = opDesc.getOutputSchema(inputSchemas) + val inputSchemas = + Map(PortIdentity() -> schema("same", 1), PortIdentity(1) -> schema("same", 2)) + val outputSchema = opDesc.getExternalOutputSchemas(inputSchemas).values.head buildOpExec = new HashJoinBuildOpExec[String](objectMapper.writeValueAsString(opDesc)) buildOpExec.open() @@ -134,7 +136,7 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { buildOpOutputIterator .next() .asInstanceOf[SchemaEnforceable] - .enforceSchema(getInternalHashTableSchema(inputSchemas.head)), + .enforceSchema(getInternalHashTableSchema(inputSchemas.head._2)), build ) .isEmpty @@ -163,8 +165,9 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.buildAttributeName = "same" opDesc.probeAttributeName = "same" opDesc.joinType = JoinType.FULL_OUTER - val inputSchemas = Array(schema("same", 1), schema("same", 2)) - val outputSchema = opDesc.getOutputSchema(inputSchemas) + val inputSchemas = + Map(PortIdentity() -> schema("same", 1), PortIdentity(1) -> schema("same", 2)) + val outputSchema = opDesc.getExternalOutputSchemas(inputSchemas).values.head buildOpExec = new HashJoinBuildOpExec[String](objectMapper.writeValueAsString(opDesc)) buildOpExec.open() @@ -188,7 +191,7 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter { buildOpOutputIterator .next() .asInstanceOf[SchemaEnforceable] - .enforceSchema(getInternalHashTableSchema(inputSchemas.head)), + .enforceSchema(getInternalHashTableSchema(inputSchemas.head._2)), build ) .isEmpty diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala index 72c062ed319..e8c26d84a23 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalOpExecSpec.scala @@ -213,7 +213,10 @@ class IntervalOpExecSpec extends AnyFlatSpec with BeforeAndAfter { rightInput: Array[T] ): Unit = { val inputSchemas = - Array(schema(leftKey, dataType), schema(rightKey, dataType)) + Map( + PortIdentity() -> schema(leftKey, dataType), + PortIdentity(1) -> schema(rightKey, dataType) + ) opDesc = new IntervalJoinOpDesc( leftKey, rightKey, @@ -222,7 +225,7 @@ class IntervalOpExecSpec extends AnyFlatSpec with BeforeAndAfter { includeRightBound, timeIntervalType ) - val outputSchema = opDesc.getOutputSchema(inputSchemas) + val outputSchema = opDesc.getExternalOutputSchemas(inputSchemas).values.head val opExec = 
new IntervalJoinOpExec(objectMapper.writeValueAsString(opDesc)) opExec.open() counter = 0 diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDescSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDescSpec.scala index b23aeb484ea..32f3cb5a86b 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDescSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDescSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.projection import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.workflow.PortIdentity import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { @@ -30,7 +31,8 @@ class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { new AttributeUnit("field1", "f1"), new AttributeUnit("field2", "f2") ) - val outputSchema = projectionOpDesc.getOutputSchema(Array(schema)) + val outputSchema = + projectionOpDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head assert(outputSchema.getAttributes.length == 2) } @@ -40,7 +42,8 @@ class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { new AttributeUnit("field2", "f2"), new AttributeUnit("field1", "f1") ) - val outputSchema = projectionOpDesc.getOutputSchema(Array(schema)) + val outputSchema = + projectionOpDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head assert(outputSchema.getAttributes.length == 2) assert(outputSchema.getIndex("f2") == 0) assert(outputSchema.getIndex("f1") == 1) @@ -53,7 +56,7 @@ class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { new AttributeUnit("field---6", "f6") ) assertThrows[RuntimeException] { - projectionOpDesc.getOutputSchema(Array(schema)) + projectionOpDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head } } @@ -61,20 +64,7 @@ class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { it should "raise IllegalArgumentException on empty attributes" in { assertThrows[IllegalArgumentException] { - projectionOpDesc.getOutputSchema(Array(schema)) - } - - } - - it should "raise IllegalArgumentException with multiple input source Schema" in { - - projectionOpDesc.attributes ++= List( - new AttributeUnit("field2", "f2"), - new AttributeUnit("field1", "f1") - ) - - assertThrows[IllegalArgumentException] { - projectionOpDesc.getOutputSchema(Array(schema, schema)) + projectionOpDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head } } @@ -86,7 +76,7 @@ class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { new AttributeUnit("field1", "f") ) assertThrows[RuntimeException] { - projectionOpDesc.getOutputSchema(Array(schema)) + projectionOpDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head } } @@ -95,7 +85,8 @@ class ProjectionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { new AttributeUnit("field1", "f1"), new AttributeUnit("field2", "") ) - val outputSchema = projectionOpDesc.getOutputSchema(Array(schema)) + val outputSchema = + projectionOpDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head assert(outputSchema.getAttributes.length == 2) } diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala 
b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala index bf1a6122e1c..55a09751ffe 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDescSpec.scala @@ -4,7 +4,6 @@ import edu.uci.ics.amber.core.storage.FileResolver import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.WorkflowContext.{DEFAULT_EXECUTION_ID, DEFAULT_WORKFLOW_ID} import edu.uci.ics.amber.operator.TestOperators -import edu.uci.ics.amber.core.workflow.PortIdentity import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -14,11 +13,7 @@ class CSVScanSourceOpDescSpec extends AnyFlatSpec with BeforeAndAfter { var parallelCsvScanSourceOpDesc: ParallelCSVScanSourceOpDesc = _ before { csvScanSourceOpDesc = new CSVScanSourceOpDesc() - csvScanSourceOpDesc.outputPortToSchemaMapping(PortIdentity()) = - csvScanSourceOpDesc.getOutputSchema(Array()) parallelCsvScanSourceOpDesc = new ParallelCSVScanSourceOpDesc() - parallelCsvScanSourceOpDesc.outputPortToSchemaMapping(PortIdentity()) = - parallelCsvScanSourceOpDesc.getOutputSchema(Array()) } it should "infer schema from single-line-data csv" in { diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDescSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDescSpec.scala index 147384f71e4..190da462e99 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDescSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDescSpec.scala @@ -1,6 +1,7 @@ package edu.uci.ics.amber.operator.udf.python import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.workflow.PortIdentity import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec class PythonLambdaFunctionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { @@ -25,7 +26,7 @@ class PythonLambdaFunctionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { AttributeType.STRING ) ) - val outputSchema = opDesc.getOutputSchema(Array(schema)) + val outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head assert(outputSchema.getAttributes.length == 4) } @@ -44,7 +45,7 @@ class PythonLambdaFunctionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { AttributeType.INTEGER ) ) - val outputSchema = opDesc.getOutputSchema(Array(schema)) + val outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head assert(outputSchema.getAttributes.length == 5) } @@ -57,7 +58,7 @@ class PythonLambdaFunctionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { AttributeType.STRING ) ) - val outputSchema = opDesc.getOutputSchema(Array(schema)) + val outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head assert(outputSchema.getAttributes.length == 3) } @@ -72,7 +73,7 @@ class PythonLambdaFunctionOpDescSpec extends AnyFlatSpec with BeforeAndAfter { ) assertThrows[RuntimeException] { - opDesc.getOutputSchema(Array(schema)) + opDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head } } diff --git 
a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala index 8ce75b6fd5f..d63b82900bb 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala @@ -28,15 +28,13 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.attribute = "field1" opDesc.delimiter = "-" opDesc.resultAttribute = "split" - opDesc.inputPortToSchemaMapping(PortIdentity()) = tupleSchema - opDesc.outputPortToSchemaMapping(PortIdentity()) = opDesc.getOutputSchema(Array(tupleSchema)) } it should "open" in { opDesc.attribute = "field1" opDesc.delimiter = "-" opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) - outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) + outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> tupleSchema)).values.head opExec.open() assert(opExec.flatMapFunc != null) } @@ -45,7 +43,7 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.attribute = "field1" opDesc.delimiter = "-" opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) - outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) + outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> tupleSchema)).values.head opExec.open() val processedTuple = opExec .processTuple(tuple, 0) @@ -61,7 +59,7 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.attribute = "field3" opDesc.delimiter = "-" opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) - outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) + outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> tupleSchema)).values.head opExec.open() val processedTuple = opExec .processTuple(tuple, 0) @@ -75,7 +73,7 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.attribute = "field1" opDesc.delimiter = "/" opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) - outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) + outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> tupleSchema)).values.head val tuple: Tuple = Tuple .builder(tupleSchema) .add(new Attribute("field1", AttributeType.STRING), "//a//b/") @@ -97,7 +95,7 @@ class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter { opDesc.attribute = "field1" opDesc.delimiter = "<\\d*>" opExec = new UnnestStringOpExec(objectMapper.writeValueAsString(opDesc)) - outputSchema = opDesc.getOutputSchema(Array(tupleSchema)) + outputSchema = opDesc.getExternalOutputSchemas(Map(PortIdentity() -> tupleSchema)).values.head val tuple: Tuple = Tuple .builder(tupleSchema) .add(new Attribute("field1", AttributeType.STRING), "<>a<1>b<12>") diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala index f8aa526a0c9..b636703c461 100644 --- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala +++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpExecSpec.scala @@ -1,6 +1,7 @@ package 
edu.uci.ics.amber.operator.visualization.htmlviz import edu.uci.ics.amber.core.tuple._ +import edu.uci.ics.amber.core.workflow.PortIdentity import edu.uci.ics.amber.util.JSONUtils.objectMapper import org.scalatest.BeforeAndAfter import org.scalatest.flatspec.AnyFlatSpec @@ -11,7 +12,8 @@ class HtmlVizOpExecSpec extends AnyFlatSpec with BeforeAndAfter { ) val opDesc: HtmlVizOpDesc = new HtmlVizOpDesc() - val outputSchema: Schema = opDesc.getOutputSchema(Array(schema)) + val outputSchema: Schema = + opDesc.getExternalOutputSchemas(Map(PortIdentity() -> schema)).values.head def tuple(): Tuple = Tuple

From 2556432685df3c13c59e08e6cbae56d32400d062 Mon Sep 17 00:00:00 2001
From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com>
Date: Tue, 31 Dec 2024 15:34:17 -0800
Subject: [PATCH 06/10] Add single schema per port validation during compilation (#3187)

Each port must have exactly one schema. If multiple links are connected to the same port, they must share the same schema. This PR introduces a validation step during schema propagation to ensure this constraint is enforced as part of the compilation process.

For example, consider a Union operator with a single input port that supports multiple links. If upstream operators produce differing output schemas, the validation will fail with an appropriate error message:

![CleanShot 2024-12-31 at 14 56 36](https://github.com/user-attachments/assets/077594b4-26fb-4983-9ce4-d7c67365645e)
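As a rough sketch of the rule being enforced (standalone, simplified types here; the real check lives in `PhysicalOp.propagateSchema` below and operates on the actual `Schema` and `PortIdentity` types):

```scala
// Sketch only: the single-schema-per-port rule, with Schema simplified to String.
def mergePortSchema(
    portId: Int,
    existing: Option[String], // schema already recorded for this port, if any
    incoming: String          // schema arriving over a newly connected link
): Option[String] =
  existing match {
    case None                     => Some(incoming) // first link on this port
    case Some(s) if s == incoming => existing       // consistent with earlier links
    case Some(s) =>
      throw new IllegalArgumentException(
        s"Conflicting schemas received on port $portId: $s != $incoming"
      )
  }
```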
---
 .../uci/ics/amber/core/workflow/PhysicalOp.scala | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala index d493f0891a5..00f244d6e15 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/workflow/PhysicalOp.scala @@ -398,8 +398,18 @@ case class PhysicalOp( */ def propagateSchema(newInputSchema: Option[(PortIdentity, Schema)] = None): PhysicalOp = { // Update the input schema if a new one is provided - val updatedOp = newInputSchema.foldLeft(this) { - case (op, (portId, schema)) => op.withInputSchema(portId, Right(schema)) + val updatedOp = newInputSchema.foldLeft(this) { (op, schemaEntry) => + val (portId, schema) = schemaEntry + op.inputPorts(portId)._3 match { + case Left(_) => + op.withInputSchema(portId, Right(schema)) + case Right(existingSchema) if existingSchema != schema => + throw new IllegalArgumentException( + s"Conflicting schemas received on port ${portId.id}: $existingSchema != $schema" + ) + case _ => + op + } } // Extract input schemas, checking if all are defined

From bf6ffc9bea1562d60e1723407d9bdb45fda868ce Mon Sep 17 00:00:00 2001
From: Xiaozhen Liu
Date: Wed, 1 Jan 2025 18:57:03 +0800
Subject: [PATCH 07/10] Add Cost Estimator Using Past Statistics for Schedule Generator (#3156)

#### This PR introduces the `CostEstimator` trait, which estimates the cost of running a region given some resource units.

- The cost estimator is used by `CostBasedScheduleGenerator` to calculate the cost of a schedule during the search.
- Currently we only consider one type of schedule for each region plan, namely a total order of the regions. The cost of the schedule (and also the cost of the region plan) is thus the sum of the costs of its regions.
- The resource units are currently passed as placeholders because we assume a region will have all the resources when doing the estimation. The units may be used in the future if we consider different methods of schedule generation. For example, if we allow two regions to run concurrently, the units will be split in half for each region.

#### A `DefaultCostEstimator` implementation is also added, which uses past execution statistics to estimate the wall-clock runtime of a region:

- The runtime of each region is represented by the runtime of its longest-running operator.
- The runtimes of operators are estimated using the statistics from the **latest successful execution** of the workflow.
- If such statistics do not exist (e.g., if it is the first execution, or if all past executions failed), we fall back to using the number of materialized ports as the cost.
- Added test cases using mock MySQL data.
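To make the search's objective concrete, here is a sketch of how a schedule's cost reduces to per-region estimates under a total order (simplified types; the actual code calls `costEstimator.estimate(region, 1)` inside `CostBasedScheduleGenerator.evaluate`, shown in the diff below):

```scala
// Sketch: a schedule is a sequence of levels; each level is a set of regions
// that could run concurrently. With a total order, each level holds one region,
// so the schedule cost is simply the sum of the per-region estimates.
def scheduleCost(
    schedule: Seq[Set[String]], // region ids stand in for Region
    estimate: String => Double  // stands in for costEstimator.estimate(_, 1)
): Double =
  schedule.map(level => level.map(estimate).sum).sum
```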
---
 .../CostBasedScheduleGenerator.scala          |  42 ++-
 .../scheduling/CostEstimator.scala            | 148 ++++++++++
 .../scheduling/DefaultCostEstimatorSpec.scala | 258 ++++++++++++++++++
 3 files changed, 435 insertions(+), 13 deletions(-)
 create mode 100644 core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostEstimator.scala
 create mode 100644 core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/scheduling/DefaultCostEstimatorSpec.scala

diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostBasedScheduleGenerator.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostBasedScheduleGenerator.scala index 457bd15e6ab..eb848c1d8f1 100644 --- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostBasedScheduleGenerator.scala +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostBasedScheduleGenerator.scala @@ -34,6 +34,9 @@ class CostBasedScheduleGenerator( numStatesExplored: Int = 0 ) + private val costEstimator = + new DefaultCostEstimator(workflowContext = workflowContext, actorId = actorId) + def generate(): (Schedule, PhysicalPlan) = { val startTime = System.nanoTime() val regionDAG = createRegionDAG() @@ -281,7 +284,9 @@ class CostBasedScheduleGenerator( if (oEarlyStop) schedulableStates.add(currentState) // Calculate the current state's cost and update the bestResult if it's lower val cost = - evaluate(regionDAG.vertexSet().asScala.toSet, regionDAG.edgeSet().asScala.toSet) + evaluate( + RegionPlan(regionDAG.vertexSet().asScala.toSet, regionDAG.edgeSet().asScala.toSet) + ) if (cost < bestResult.cost) { bestResult = SearchResult(currentState, regionDAG, cost) } @@ -334,7 +339,12 @@ class CostBasedScheduleGenerator( physicalPlan.getNonMaterializedBlockingAndDependeeLinks ++ neighborState ) match { case Left(regionDAG) => - evaluate(regionDAG.vertexSet().asScala.toSet, regionDAG.edgeSet().asScala.toSet) + evaluate( + RegionPlan( + regionDAG.vertexSet().asScala.toSet, + regionDAG.edgeSet().asScala.toSet + ) + ) case Right(_) => Double.MaxValue } @@ -423,7 +433,9 @@ class CostBasedScheduleGenerator( def updateOptimumIfApplicable(regionDAG: DirectedAcyclicGraph[Region, RegionLink]): Unit = { // Calculate the current state's cost and update the bestResult if it's lower val cost = - evaluate(regionDAG.vertexSet().asScala.toSet, regionDAG.edgeSet().asScala.toSet) + evaluate( + RegionPlan(regionDAG.vertexSet().asScala.toSet, regionDAG.edgeSet().asScala.toSet) + ) if (cost < bestResult.cost) { bestResult = SearchResult(currentState, regionDAG, cost) } @@ -453,7 +465,12 @@ class CostBasedScheduleGenerator( physicalPlan.getNonMaterializedBlockingAndDependeeLinks ++ neighborState ) match { case Left(regionDAG) => - evaluate(regionDAG.vertexSet().asScala.toSet, regionDAG.edgeSet().asScala.toSet) + evaluate( + RegionPlan( + regionDAG.vertexSet().asScala.toSet, + regionDAG.edgeSet().asScala.toSet + ) + ) case Right(_) => Double.MaxValue } @@ -472,17 +489,16 @@ } /** - * The cost function used by the search. Takes in a region graph represented as set of regions and links. + * The cost function used by the search. Takes a region plan, generates one or more (to be done in the future) + * schedules based on the region plan, and calculates the cost of the schedule(s) using the cost estimator. Uses the cost + * of the best schedule (currently only one schedule is considered) as the cost of the region plan. * - * @param regions A set of regions created based on a search state. - * @param regionLinks A set of links to indicate dependencies between regions, based on the materialization edges. - * @return A cost determined by the resource allocator. + * @return A cost determined by the cost estimator. */ - private def evaluate(regions: Set[Region], regionLinks: Set[RegionLink]): Double = { - // Using number of materialized ports as the cost. - // This is independent of the schedule / resource allocator. - // In the future we may need to use the ResourceAllocator to get the cost. - regions.flatMap(_.materializedPortIds).size + private def evaluate(regionPlan: RegionPlan): Double = { + val schedule = generateScheduleFromRegionPlan(regionPlan) + // In the future we may allow multiple regions in a level and split the resources. + schedule.map(level => level.map(region => costEstimator.estimate(region, 1)).sum).sum } } diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostEstimator.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostEstimator.scala new file mode 100644 index 00000000000..c675d44f154 --- /dev/null +++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/scheduling/CostEstimator.scala @@ -0,0 +1,148 @@ +package edu.uci.ics.amber.engine.architecture.scheduling + +import edu.uci.ics.amber.core.storage.StorageConfig +import edu.uci.ics.amber.core.workflow.WorkflowContext +import edu.uci.ics.amber.engine.architecture.scheduling.DefaultCostEstimator.DEFAULT_OPERATOR_COST +import edu.uci.ics.amber.engine.common.AmberLogging +import edu.uci.ics.amber.core.virtualidentity.ActorVirtualIdentity +import edu.uci.ics.texera.dao.SqlServer +import edu.uci.ics.texera.dao.SqlServer.withTransaction +import edu.uci.ics.texera.dao.jooq.generated.Tables.{ + WORKFLOW_EXECUTIONS, + WORKFLOW_RUNTIME_STATISTICS, + WORKFLOW_VERSION +} +import edu.uci.ics.texera.dao.jooq.generated.tables.pojos.WorkflowRuntimeStatistics +import org.jooq.types.UInteger + +import scala.jdk.CollectionConverters.ListHasAsScala +import scala.util.{Failure, Success, Try} + +/** + * A cost estimator estimates the cost of running a region under the given resource constraints, expressed as resource units. + */ +trait CostEstimator { + def estimate(region: Region, resourceUnits: Int): Double +} + +object DefaultCostEstimator { + val DEFAULT_OPERATOR_COST: Double = 1.0 +} + +/** + * A default cost estimator using past statistics. If past statistics of a workflow are available, the cost of a region + * is the execution time of its longest-running operator.
Otherwise the cost is the number of materialized ports in the + * region. + */ +class DefaultCostEstimator( + workflowContext: WorkflowContext, + val actorId: ActorVirtualIdentity +) extends CostEstimator + with AmberLogging { + + // Requires a MySQL database to retrieve execution statistics; otherwise the number of materialized ports is used as a default. + private val operatorEstimatedTimeOption = Try( + this.getOperatorExecutionTimeInSeconds( + this.workflowContext.workflowId.id + ) + ) match { + case Failure(_) => None + case Success(result) => result + } + + operatorEstimatedTimeOption match { + case None => + logger.info( + s"WID: ${workflowContext.workflowId.id}, EID: ${workflowContext.executionId.id}, " + + s"no past execution statistics available. Using the number of materialized output ports as the cost. " + ) + case Some(_) => + } + + override def estimate(region: Region, resourceUnits: Int): Double = { + this.operatorEstimatedTimeOption match { + case Some(operatorEstimatedTime) => + // Use past statistics (wall-clock runtime). We use the execution time of the longest-running + // operator in each region to represent the region's execution time, and use the sum of all the regions' + // execution times as the wall-clock runtime of the workflow. + // This assumes a schedule is a total order of the regions. + val opExecutionTimes = region.getOperators.map(op => { + operatorEstimatedTime.getOrElse(op.id.logicalOpId.id, DEFAULT_OPERATOR_COST) + }) + val longestRunningOpExecutionTime = opExecutionTimes.max + longestRunningOpExecutionTime + case None => + // Without past statistics (e.g., first execution), we use the number of materialized ports as the cost. + // This is independent of the schedule / resource allocator. + region.materializedPortIds.size + } + } + + /** + * Retrieves statistics from the latest successful execution to calculate costs in DefaultCostEstimator. + * The total control processing time plus data processing time of an operator is used as its cost. + * If no past statistics are available (e.g., first execution), returns None. 
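For example (hypothetical values): a statistics row with dataProcessingTime = 2.0e9 ns and controlProcessingTime = 5.0e8 ns contributes (2.0e9 + 5.0e8) / 1e9 = 2.5 seconds to the total of the operator that produced it. 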
+ */ + private def getOperatorExecutionTimeInSeconds( + wid: Long + ): Option[Map[String, Double]] = { + + val operatorEstimatedTimeOption = withTransaction( + SqlServer + .getInstance( + StorageConfig.jdbcUrl, + StorageConfig.jdbcUsername, + StorageConfig.jdbcPassword + ) + .createDSLContext() + ) { context => + val widAsUInteger = UInteger.valueOf(wid) + val rawStats = context + .select( + WORKFLOW_RUNTIME_STATISTICS.OPERATOR_ID, + WORKFLOW_RUNTIME_STATISTICS.TIME, + WORKFLOW_RUNTIME_STATISTICS.DATA_PROCESSING_TIME, + WORKFLOW_RUNTIME_STATISTICS.CONTROL_PROCESSING_TIME, + WORKFLOW_RUNTIME_STATISTICS.EXECUTION_ID + ) + .from(WORKFLOW_RUNTIME_STATISTICS) + .where( + WORKFLOW_RUNTIME_STATISTICS.WORKFLOW_ID + .eq(widAsUInteger) + .and( + WORKFLOW_RUNTIME_STATISTICS.EXECUTION_ID.eq( + context + .select( + WORKFLOW_EXECUTIONS.EID + ) + .from(WORKFLOW_EXECUTIONS) + .join(WORKFLOW_VERSION) + .on(WORKFLOW_VERSION.VID.eq(WORKFLOW_EXECUTIONS.VID)) + .where( + WORKFLOW_VERSION.WID + .eq(widAsUInteger) + .and(WORKFLOW_EXECUTIONS.STATUS.eq(3.toByte)) + ) + .orderBy(WORKFLOW_EXECUTIONS.STARTING_TIME.desc()) + .limit(1) + ) + ) + ) + .orderBy(WORKFLOW_RUNTIME_STATISTICS.TIME, WORKFLOW_RUNTIME_STATISTICS.OPERATOR_ID) + .fetchInto(classOf[WorkflowRuntimeStatistics]) + .asScala + .toList + if (rawStats.isEmpty) { + None + } else { + val cumulatedStats = rawStats.foldLeft(Map.empty[String, Double]) { (acc, stat) => + val opTotalExecutionTime = acc.getOrElse(stat.getOperatorId, 0.0) + acc + (stat.getOperatorId -> (opTotalExecutionTime + (stat.getDataProcessingTime + .doubleValue() + stat.getControlProcessingTime.doubleValue()) / 1e9)) + } + Some(cumulatedStats) + } + } + operatorEstimatedTimeOption + } +} diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/scheduling/DefaultCostEstimatorSpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/scheduling/DefaultCostEstimatorSpec.scala new file mode 100644 index 00000000000..636a82d7dc8 --- /dev/null +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/scheduling/DefaultCostEstimatorSpec.scala @@ -0,0 +1,258 @@ +package edu.uci.ics.amber.engine.architecture.scheduling + +import edu.uci.ics.amber.core.workflow.{PortIdentity, WorkflowContext} +import edu.uci.ics.amber.engine.common.virtualidentity.util.CONTROLLER +import edu.uci.ics.amber.engine.e2e.TestUtils.buildWorkflow +import edu.uci.ics.amber.operator.TestOperators +import edu.uci.ics.amber.operator.aggregate.{AggregateOpDesc, AggregationFunction} +import edu.uci.ics.amber.operator.keywordSearch.KeywordSearchOpDesc +import edu.uci.ics.amber.operator.source.scan.csv.CSVScanSourceOpDesc +import edu.uci.ics.texera.dao.MockTexeraDB +import edu.uci.ics.texera.dao.jooq.generated.enums.UserRole +import edu.uci.ics.texera.dao.jooq.generated.tables.daos._ +import edu.uci.ics.texera.dao.jooq.generated.tables.pojos._ +import edu.uci.ics.texera.workflow.LogicalLink +import org.jooq.types.{UInteger, ULong} +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} + +import scala.jdk.CollectionConverters.CollectionHasAsScala + +class DefaultCostEstimatorSpec + extends AnyFlatSpec + with BeforeAndAfterAll + with BeforeAndAfterEach + with MockTexeraDB { + + private val headerlessCsvOpDesc: CSVScanSourceOpDesc = + TestOperators.headerlessSmallCsvScanOpDesc() + private val keywordOpDesc: KeywordSearchOpDesc = + TestOperators.keywordSearchOpDesc("column-1", "Asia") + private val groupByOpDesc: AggregateOpDesc = + 
TestOperators.aggregateAndGroupByDesc("column-1", AggregationFunction.COUNT, List[String]()) + + private val testUser: User = { + val user = new User + user.setUid(UInteger.valueOf(1)) + user.setName("test_user") + user.setRole(UserRole.ADMIN) + user.setPassword("123") + user.setEmail("test_user@test.com") + user + } + + private val testWorkflowEntry: Workflow = { + val workflow = new Workflow + workflow.setName("test workflow") + workflow.setWid(UInteger.valueOf(1)) + workflow.setContent("test workflow content") + workflow.setDescription("test description") + workflow + } + + private val testWorkflowVersionEntry: WorkflowVersion = { + val workflowVersion = new WorkflowVersion + workflowVersion.setWid(UInteger.valueOf(1)) + workflowVersion.setVid(UInteger.valueOf(1)) + workflowVersion.setContent("test version content") + workflowVersion + } + + private val testWorkflowExecutionEntry: WorkflowExecutions = { + val workflowExecution = new WorkflowExecutions + workflowExecution.setEid(UInteger.valueOf(1)) + workflowExecution.setVid(UInteger.valueOf(1)) + workflowExecution.setUid(UInteger.valueOf(1)) + workflowExecution.setStatus(3.toByte) + workflowExecution.setEnvironmentVersion("test engine") + workflowExecution + } + + private val headerlessCsvOpStatisticsEntry: WorkflowRuntimeStatistics = { + val workflowRuntimeStatistics = new WorkflowRuntimeStatistics + workflowRuntimeStatistics.setOperatorId(headerlessCsvOpDesc.operatorIdentifier.id) + workflowRuntimeStatistics.setWorkflowId(UInteger.valueOf(1)) + workflowRuntimeStatistics.setExecutionId(UInteger.valueOf(1)) + workflowRuntimeStatistics.setDataProcessingTime(ULong.valueOf(100)) + workflowRuntimeStatistics.setControlProcessingTime(ULong.valueOf(100)) + workflowRuntimeStatistics + } + + private val keywordOpDescStatisticsEntry: WorkflowRuntimeStatistics = { + val workflowRuntimeStatistics = new WorkflowRuntimeStatistics + workflowRuntimeStatistics.setOperatorId(keywordOpDesc.operatorIdentifier.id) + workflowRuntimeStatistics.setWorkflowId(UInteger.valueOf(1)) + workflowRuntimeStatistics.setExecutionId(UInteger.valueOf(1)) + workflowRuntimeStatistics.setDataProcessingTime(ULong.valueOf(300)) + workflowRuntimeStatistics.setControlProcessingTime(ULong.valueOf(300)) + workflowRuntimeStatistics + } + + private val groupByOpDescStatisticsEntry: WorkflowRuntimeStatistics = { + val workflowRuntimeStatistics = new WorkflowRuntimeStatistics + workflowRuntimeStatistics.setOperatorId(groupByOpDesc.operatorIdentifier.id) + workflowRuntimeStatistics.setWorkflowId(UInteger.valueOf(1)) + workflowRuntimeStatistics.setExecutionId(UInteger.valueOf(1)) + workflowRuntimeStatistics.setDataProcessingTime(ULong.valueOf(1000)) + workflowRuntimeStatistics.setControlProcessingTime(ULong.valueOf(1000)) + workflowRuntimeStatistics + } + + override protected def beforeEach(): Unit = { + initializeDBAndReplaceDSLContext() + } + + "DefaultCostEstimator" should "use fallback method when no past statistics are available" in { + val workflow = buildWorkflow( + List(headerlessCsvOpDesc, keywordOpDesc), + List( + LogicalLink( + headerlessCsvOpDesc.operatorIdentifier, + PortIdentity(0), + keywordOpDesc.operatorIdentifier, + PortIdentity(0) + ) + ), + new WorkflowContext() + ) + + val costEstimator = new DefaultCostEstimator( + workflow.context, + CONTROLLER + ) + + val region = Region( + id = RegionIdentity(0), + physicalOps = workflow.physicalPlan.operators, + physicalLinks = workflow.physicalPlan.links + ) + + val costOfRegion = costEstimator.estimate(region, 1) + + 
assert(costOfRegion == 0) + } + + "DefaultCostEstimator" should "use the latest successful execution to estimate cost when available" in { + val workflow = buildWorkflow( + List(headerlessCsvOpDesc, keywordOpDesc), + List( + LogicalLink( + headerlessCsvOpDesc.operatorIdentifier, + PortIdentity(0), + keywordOpDesc.operatorIdentifier, + PortIdentity(0) + ) + ), + new WorkflowContext() + ) + + val userDao = new UserDao(getDSLContext.configuration()) + val workflowDao = new WorkflowDao(getDSLContext.configuration()) + val workflowExecutionsDao = new WorkflowExecutionsDao(getDSLContext.configuration()) + val workflowVersionDao = new WorkflowVersionDao(getDSLContext.configuration()) + val workflowRuntimeStatisticsDao = + new WorkflowRuntimeStatisticsDao(getDSLContext.configuration()) + + userDao.insert(testUser) + workflowDao.insert(testWorkflowEntry) + workflowVersionDao.insert(testWorkflowVersionEntry) + workflowExecutionsDao.insert(testWorkflowExecutionEntry) + workflowRuntimeStatisticsDao.insert(headerlessCsvOpStatisticsEntry) + workflowRuntimeStatisticsDao.insert(keywordOpDescStatisticsEntry) + + val costEstimator = new DefaultCostEstimator( + workflow.context, + CONTROLLER + ) + + val region = Region( + id = RegionIdentity(0), + physicalOps = workflow.physicalPlan.operators, + physicalLinks = workflow.physicalPlan.links + ) + + val costOfRegion = costEstimator.estimate(region, 1) + + assert(costOfRegion != 0) + } + + "DefaultCostEstimator" should "correctly estimate costs in a search" in { + val workflow = buildWorkflow( + List(headerlessCsvOpDesc, groupByOpDesc, keywordOpDesc), + List( + LogicalLink( + headerlessCsvOpDesc.operatorIdentifier, + PortIdentity(0), + groupByOpDesc.operatorIdentifier, + PortIdentity(0) + ), + LogicalLink( + groupByOpDesc.operatorIdentifier, + PortIdentity(0), + keywordOpDesc.operatorIdentifier, + PortIdentity(0) + ) + ), + new WorkflowContext() + ) + + val userDao = new UserDao(getDSLContext.configuration()) + val workflowDao = new WorkflowDao(getDSLContext.configuration()) + val workflowExecutionsDao = new WorkflowExecutionsDao(getDSLContext.configuration()) + val workflowVersionDao = new WorkflowVersionDao(getDSLContext.configuration()) + val workflowRuntimeStatisticsDao = + new WorkflowRuntimeStatisticsDao(getDSLContext.configuration()) + + userDao.insert(testUser) + workflowDao.insert(testWorkflowEntry) + workflowVersionDao.insert(testWorkflowVersionEntry) + workflowExecutionsDao.insert(testWorkflowExecutionEntry) + workflowRuntimeStatisticsDao.insert(headerlessCsvOpStatisticsEntry) + workflowRuntimeStatisticsDao.insert(groupByOpDescStatisticsEntry) + workflowRuntimeStatisticsDao.insert(keywordOpDescStatisticsEntry) + + // Should contain two regions, one with CSV->localAgg->globalAgg, another with keyword->sink + val searchResult = new CostBasedScheduleGenerator( + workflow.context, + workflow.physicalPlan, + CONTROLLER + ).bottomUpSearch() + + val groupByRegion = + searchResult.regionDAG.vertexSet().asScala.filter(region => region.physicalOps.size == 3).head + val keywordRegion = + searchResult.regionDAG.vertexSet().asScala.filter(region => region.physicalOps.size == 2).head + + val costEstimator = new DefaultCostEstimator( + workflow.context, + CONTROLLER + ) + + val groupByRegionCost = costEstimator.estimate(groupByRegion, 1) + + val groupByOperatorCost = (groupByOpDescStatisticsEntry.getDataProcessingTime + .doubleValue() + groupByOpDescStatisticsEntry.getControlProcessingTime.doubleValue()) / 1e9 + + // The cost of the first region should be the 
cost of the GroupBy operator (note that the two physical operators for + // the GroupBy logical operator have the same cost because statistics are recorded per logical operator). + // The GroupBy operator has a longer running time. + assert(groupByRegionCost == groupByOperatorCost) + + val keywordRegionCost = costEstimator.estimate(keywordRegion, 1) + + val keywordOperatorCost = (keywordOpDescStatisticsEntry.getDataProcessingTime + .doubleValue() + keywordOpDescStatisticsEntry.getControlProcessingTime.doubleValue()) / 1e9 + + // The cost of the second region should be the cost of the keyword operator, since the sink operator has the same + // logical operator as the keyword operator. + assert(keywordRegionCost == keywordOperatorCost) + + // The cost of the region plan should be the sum of the region costs + assert(searchResult.cost == groupByRegionCost + keywordRegionCost) + } + + override protected def afterEach(): Unit = { + shutdownDB() + } + +} From c2bef3abfd67cc7033ba2f6926aec3d4c0532b84 Mon Sep 17 00:00:00 2001 From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com> Date: Wed, 1 Jan 2025 10:26:42 -0800 Subject: [PATCH 08/10] Simplify schema build (#3188) To simplify schema creation, this PR removes the Schema.builder() pattern and makes Schema immutable. All modifications now result in the creation of a new Schema instance. --- .../NetworkInputGatewaySpec.scala | 6 +- .../messaginglayer/OutputManagerSpec.scala | 4 +- .../RangeBasedShuffleSpec.scala | 6 +- .../architecture/worker/DPThreadSpec.scala | 6 +- .../worker/DataProcessorSpec.scala | 2 +- .../architecture/worker/WorkerSpec.scala | 6 +- .../engine/faulttolerance/LoggingSpec.scala | 4 +- .../uci/ics/amber/core/marker/Marker.scala | 11 +- .../uci/ics/amber/core/tuple/Attribute.java | 2 +- .../amber/core/tuple/AttributeTypeUtils.scala | 27 +- .../edu/uci/ics/amber/core/tuple/Schema.scala | 253 ++++++++--------- .../uci/ics/amber/core/tuple/TupleUtils.scala | 12 +- .../edu/uci/ics/amber/util/ArrowUtils.scala | 12 +- .../uci/ics/amber/core/tuple/SchemaSpec.scala | 258 ++++++++++++++++++ .../uci/ics/amber/core/tuple/TupleSpec.scala | 25 +- .../operator/SpecialPhysicalOpFactory.scala | 6 +- .../operator/aggregate/AggregateOpDesc.scala | 9 +- .../CartesianProductOpDesc.scala | 9 +- .../dictionary/DictionaryMatcherOpDesc.scala | 10 +- .../operator/hashJoin/HashJoinOpDesc.scala | 53 ++-- ...gingFaceIrisLogisticRegressionOpDesc.scala | 5 +- .../HuggingFaceSentimentAnalysisOpDesc.scala | 5 +- .../HuggingFaceSpamSMSDetectionOpDesc.scala | 5 +- .../HuggingFaceTextSummarizationOpDesc.scala | 5 +- .../intervalJoin/IntervalJoinOpDesc.scala | 20 +- .../Scorer/MachineLearningScorerOpDesc.scala | 19 +- .../base/SklearnAdvancedBaseDesc.scala | 11 +- .../projection/ProjectionOpDesc.scala | 30 +- .../sentiment/SentimentAnalysisOpDesc.scala | 14 +- .../sklearn/SklearnClassifierOpDesc.scala | 4 +- .../SklearnLinearRegressionOpDesc.scala | 4 +- .../sklearn/SklearnPredictionOpDesc.scala | 5 +- .../reddit/RedditSearchSourceOpDesc.scala | 42 ++- ...TwitterFullArchiveSearchSourceOpDesc.scala | 75 +++-- .../v2/TwitterSearchSourceOpDesc.scala | 74 +++-- .../source/fetcher/URLFetcherOpDesc.scala | 10 +- .../source/scan/FileScanSourceOpDesc.scala | 10 +- .../source/scan/csv/CSVScanSourceOpDesc.scala | 9 +- .../csv/ParallelCSVScanSourceOpDesc.scala | 16 +- .../scan/csvOld/CSVOldScanSourceOpDesc.scala | 16 +- .../scan/json/JSONLScanSourceOpDesc.scala | 10 +- .../scan/text/TextInputSourceOpDesc.scala | 7 +- .../operator/source/sql/SQLSourceOpDesc.scala | 47 
++-- .../sql/asterixdb/AsterixDBSourceOpDesc.scala | 15 +- .../operator/udf/java/JavaUDFOpDesc.scala | 17 +- .../DualInputPortsPythonUDFOpDescV2.scala | 20 +- .../python/PythonLambdaFunctionOpDesc.scala | 26 +- .../udf/python/PythonTableReducerOpDesc.scala | 8 +- .../udf/python/PythonUDFOpDescV2.scala | 23 +- .../source/PythonUDFSourceOpDescV2.scala | 8 +- .../ics/amber/operator/udf/r/RUDFOpDesc.scala | 22 +- .../operator/udf/r/RUDFSourceOpDesc.scala | 8 +- .../unneststring/UnnestStringOpDesc.scala | 14 +- .../visualization/DotPlot/DotPlotOpDesc.scala | 9 +- .../IcicleChart/IcicleChartOpDesc.scala | 9 +- .../ImageViz/ImageVisualizerOpDesc.scala | 9 +- .../ScatterMatrixChartOpDesc.scala | 9 +- .../barChart/BarChartOpDesc.scala | 9 +- .../visualization/boxPlot/BoxPlotOpDesc.scala | 9 +- .../bubbleChart/BubbleChartOpDesc.scala | 9 +- .../CandlestickChartOpDesc.scala | 9 +- .../ContinuousErrorBandsOpDesc.scala | 9 +- .../contourPlot/ContourPlotOpDesc.scala | 9 +- .../dumbbellPlot/DumbbellPlotOpDesc.scala | 9 +- .../FigureFactoryTableOpDesc.scala | 9 +- .../filledAreaPlot/FilledAreaPlotOpDesc.scala | 9 +- .../funnelPlot/FunnelPlotOpDesc.scala | 9 +- .../ganttChart/GanttChartOpDesc.scala | 9 +- .../visualization/heatMap/HeatMapOpDesc.scala | 9 +- .../hierarchychart/HierarchyChartOpDesc.scala | 9 +- .../histogram/HistogramChartOpDesc.scala | 9 +- .../visualization/htmlviz/HtmlVizOpDesc.scala | 7 +- .../lineChart/LineChartOpDesc.scala | 9 +- .../pieChart/PieChartOpDesc.scala | 9 +- .../quiverPlot/QuiverPlotOpDesc.scala | 9 +- .../sankeyDiagram/SankeyDiagramOpDesc.scala | 9 +- .../scatter3DChart/Scatter3dChartOpDesc.scala | 9 +- .../scatterplot/ScatterplotOpDesc.scala | 9 +- .../tablesChart/TablesPlotOpDesc.scala | 9 +- .../ternaryPlot/TernaryPlotOpDesc.scala | 9 +- .../visualization/urlviz/UrlVizOpDesc.scala | 7 +- .../waterfallChart/WaterfallChartOpDesc.scala | 9 +- .../wordCloud/WordCloudOpDesc.scala | 13 +- .../CartesianProductOpExecSpec.scala | 12 +- .../DictionaryMatcherOpExecSpec.scala | 4 +- .../difference/DifferenceOpExecSpec.scala | 12 +- .../distinct/DistinctOpExecSpec.scala | 4 +- .../filter/SpecializedFilterOpExecSpec.scala | 6 +- .../operator/hashJoin/HashJoinOpSpec.scala | 16 +- .../intersect/IntersectOpExecSpec.scala | 8 +- .../intervalJoin/IntervalOpExecSpec.scala | 11 +- .../KeywordSearchOpExecSpec.scala | 4 +- .../projection/ProjectionOpExecSpec.scala | 16 +- .../SortPartitionsOpExecSpec.scala | 4 +- .../SymmetricDifferenceOpExecSpec.scala | 12 +- .../typecasting/TypeCastingOpExecSpec.scala | 9 +- .../unneststring/UnnestStringOpExecSpec.scala | 4 +- .../uci/ics/amber/util/ArrowUtilsSpec.scala | 4 +- 98 files changed, 892 insertions(+), 829 deletions(-) create mode 100644 core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/SchemaSpec.scala diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/NetworkInputGatewaySpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/NetworkInputGatewaySpec.scala index 2203ff7f2a9..c05fae417e4 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/NetworkInputGatewaySpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/NetworkInputGatewaySpec.scala @@ -10,13 +10,11 @@ class NetworkInputGatewaySpec extends AnyFlatSpec with MockFactory { private val fakeReceiverID = ActorVirtualIdentity("testReceiver") private val fakeSenderID = ActorVirtualIdentity("testSender") - private val channelId = 
ChannelIdentity(fakeSenderID, fakeReceiverID, false) + private val channelId = ChannelIdentity(fakeSenderID, fakeReceiverID, isControl = false) private val payloads = (0 until 4).map { i => DataFrame( Array( - TupleLike(i) enforceSchema ( - Schema.builder().add("field1", AttributeType.INTEGER).build() - ) + TupleLike(i) enforceSchema Schema().add("field1", AttributeType.INTEGER) ) ) }.toArray diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/OutputManagerSpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/OutputManagerSpec.scala index 916f6cf3e81..4fb4e45ce8f 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/OutputManagerSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/OutputManagerSpec.scala @@ -22,15 +22,13 @@ class OutputManagerSpec extends AnyFlatSpec with MockFactory { private val mockDataOutputPort = // scalafix:ok; need it for wiring purpose new NetworkOutputGateway(identifier, mockHandler) var counter: Int = 0 - val schema: Schema = Schema - .builder() + val schema: Schema = Schema() .add("field1", AttributeType.INTEGER) .add("field2", AttributeType.INTEGER) .add("field3", AttributeType.INTEGER) .add("field4", AttributeType.INTEGER) .add("field5", AttributeType.STRING) .add("field6", AttributeType.DOUBLE) - .build() def physicalOpId(): PhysicalOpIdentity = { counter += 1 diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/RangeBasedShuffleSpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/RangeBasedShuffleSpec.scala index 59034227074..3c0bfbf6e8c 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/RangeBasedShuffleSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/messaginglayer/RangeBasedShuffleSpec.scala @@ -16,7 +16,7 @@ class RangeBasedShuffleSpec extends AnyFlatSpec with MockFactory { val fakeID5: ActorVirtualIdentity = ActorVirtualIdentity("rec5") val attr: Attribute = new Attribute("Attr1", AttributeType.INTEGER) - val schema: Schema = Schema.builder().add(attr).build() + val schema: Schema = Schema().add(attr) val partitioning: RangeBasedShufflePartitioning = RangeBasedShufflePartitioning( 400, @@ -82,7 +82,7 @@ class RangeBasedShuffleSpec extends AnyFlatSpec with MockFactory { val partitioner2: RangeBasedShufflePartitioner = RangeBasedShufflePartitioner(partitioning2) val doubleAttr: Attribute = new Attribute("Attr2", AttributeType.DOUBLE) - val doubleSchema: Schema = Schema.builder().add(doubleAttr).build() + val doubleSchema: Schema = Schema().add(doubleAttr) tuple = Tuple.builder(doubleSchema).add(doubleAttr, -90.5).build() idx = partitioner2.getBucketIndex(tuple) assert(idx.next() == 1) @@ -104,7 +104,7 @@ class RangeBasedShuffleSpec extends AnyFlatSpec with MockFactory { val partitioner3: RangeBasedShufflePartitioner = RangeBasedShufflePartitioner(partitioning3) val longAttr: Attribute = new Attribute("Attr3", AttributeType.LONG) - val longSchema: Schema = Schema.builder().add(longAttr).build() + val longSchema: Schema = Schema().add(longAttr) tuple = Tuple.builder(longSchema).add(longAttr, -90L).build() idx = partitioner3.getBucketIndex(tuple) assert(idx.next() == 1) diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DPThreadSpec.scala 
b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DPThreadSpec.scala index 8c8cedefcb7..ef33ecc4aaf 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DPThreadSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DPThreadSpec.scala @@ -35,7 +35,7 @@ class DPThreadSpec extends AnyFlatSpec with MockFactory { private val executor = mock[OperatorExecutor] private val mockInputPortId = PortIdentity() - private val schema: Schema = Schema.builder().add("field1", AttributeType.INTEGER).build() + private val schema: Schema = Schema().add("field1", AttributeType.INTEGER) private val tuples: Array[Tuple] = (0 until 5000) .map(i => TupleLike(i).enforceSchema(schema)) .toArray @@ -167,7 +167,7 @@ class DPThreadSpec extends AnyFlatSpec with MockFactory { } "DP Thread" should "write determinant logs to local storage while processing" in { - val dp = new DataProcessor(workerId, x => {}) + val dp = new DataProcessor(workerId, _ => {}) dp.executor = executor val inputQueue = new LinkedBlockingQueue[DPInputQueueElement]() val anotherSenderWorkerId = ActorVirtualIdentity("another") @@ -183,7 +183,7 @@ class DPThreadSpec extends AnyFlatSpec with MockFactory { ) logStorage.deleteStorage() val logManager: ReplayLogManager = - ReplayLogManager.createLogManager(logStorage, "tmpLog", x => {}) + ReplayLogManager.createLogManager(logStorage, "tmpLog", _ => {}) val dpThread = new DPThread(workerId, dp, logManager, inputQueue) dpThread.start() tuples.foreach { x => diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DataProcessorSpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DataProcessorSpec.scala index a3b62cfabca..2ef61da21e6 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DataProcessorSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/DataProcessorSpec.scala @@ -46,7 +46,7 @@ class DataProcessorSpec extends AnyFlatSpec with MockFactory with BeforeAndAfter private val outputPortId = PortIdentity() private val outputHandler = mock[Either[MainThreadDelegateMessage, WorkflowFIFOMessage] => Unit] private val adaptiveBatchingMonitor = mock[WorkerTimerService] - private val schema: Schema = Schema.builder().add("field1", AttributeType.INTEGER).build() + private val schema: Schema = Schema().add("field1", AttributeType.INTEGER) private val tuples: Array[Tuple] = (0 until 400) .map(i => TupleLike(i).enforceSchema(schema)) .toArray diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala index 4fc14016ac1..85ac8aa0fbd 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/architecture/worker/WorkerSpec.scala @@ -48,11 +48,11 @@ class WorkerSpec with MockFactory { def mkSchema(fields: Any*): Schema = { - val schemaBuilder = Schema.builder() + var schema = Schema() fields.indices.foreach { i => - schemaBuilder.add(new Attribute("field" + i, AttributeType.ANY)) + schema = schema.add(new Attribute("field" + i, AttributeType.ANY)) } - schemaBuilder.build() + schema } def mkTuple(fields: Any*): Tuple = { diff --git a/core/amber/src/test/scala/edu/uci/ics/amber/engine/faulttolerance/LoggingSpec.scala 
b/core/amber/src/test/scala/edu/uci/ics/amber/engine/faulttolerance/LoggingSpec.scala index fade388c4ff..a35a1d41d66 100644 --- a/core/amber/src/test/scala/edu/uci/ics/amber/engine/faulttolerance/LoggingSpec.scala +++ b/core/amber/src/test/scala/edu/uci/ics/amber/engine/faulttolerance/LoggingSpec.scala @@ -80,12 +80,10 @@ class LoggingSpec (0 to 400) .map(i => TupleLike(i, i.toString, i.toDouble).enforceSchema( - Schema - .builder() + Schema() .add("field1", AttributeType.INTEGER) .add("field2", AttributeType.STRING) .add("field3", AttributeType.DOUBLE) - .build() ) ) .toArray diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/marker/Marker.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/marker/Marker.scala index 3c201bf51d0..300816c9da4 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/marker/Marker.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/marker/Marker.scala @@ -31,13 +31,10 @@ final case class State(tuple: Option[Tuple] = None, passToAllDownstream: Boolean def toTuple: Tuple = Tuple .builder( - Schema - .builder() - .add(data.map { - case (name, (attrType, _)) => - new Attribute(name, attrType) - }) - .build() + Schema(data.map { + case (name, (attrType, _)) => + new Attribute(name, attrType) + }.toList) ) .addSequentially(data.values.map(_._2).toArray) .build() diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Attribute.java b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Attribute.java index 9c7661d514e..643a61d7b12 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Attribute.java +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Attribute.java @@ -38,7 +38,7 @@ public AttributeType getType() { @Override public String toString() { - return "edu.ics.uci.amber.model.tuple.model.Attribute[name=" + attributeName + ", type=" + attributeType + "]"; + return "Attribute[name=" + attributeName + ", type=" + attributeType + "]"; } @Override diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/AttributeTypeUtils.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/AttributeTypeUtils.scala index 1e333b8d7de..0a08b2883ee 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/AttributeTypeUtils.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/AttributeTypeUtils.scala @@ -11,39 +11,36 @@ import scala.util.control.Exception.allCatch object AttributeTypeUtils extends Serializable { /** - * this loop check whether the current attribute in the array is the attribute for casting, - * if it is, change it to result type - * if it's not, remain the same type - * we need this loop to keep the order the same as the original + * This function checks whether the current attribute in the schema matches the selected attribute for casting. + * If it matches, its type is changed to the specified result type. + * If it doesn't match, the original type is retained. + * The order of attributes in the schema is preserved. 
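For example (hypothetical attributes): casting "age" to DOUBLE in a schema [name: STRING, age: INTEGER, city: STRING] yields [name: STRING, age: DOUBLE, city: STRING], with every attribute kept in its original position. 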
+ * * @param schema schema of data * @param attribute selected attribute * @param resultType casting type - * @return schema of data + * @return a new schema with the modified attribute type */ def SchemaCasting( schema: Schema, attribute: String, resultType: AttributeType ): Schema = { - // need a builder to maintain the order of original schema - val builder = Schema.builder() - val attributes: List[Attribute] = schema.getAttributes - // change the schema when meet selected attribute else remain the same - for (i <- attributes.indices) { - if (attributes.apply(i).getName.equals(attribute)) { + val updatedAttributes = schema.getAttributes.map { attr => + if (attr.getName == attribute) { resultType match { case AttributeType.STRING | AttributeType.INTEGER | AttributeType.DOUBLE | AttributeType.LONG | AttributeType.BOOLEAN | AttributeType.TIMESTAMP | AttributeType.BINARY => - builder.add(attribute, resultType) + new Attribute(attribute, resultType) // Cast to the specified result type case AttributeType.ANY | _ => - builder.add(attribute, attributes.apply(i).getType) + attr // Retain the original type for unsupported types } } else { - builder.add(attributes.apply(i).getName, attributes.apply(i).getType) + attr // Retain attributes that don't match the target } } - builder.build() + Schema(updatedAttributes) } /** diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Schema.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Schema.scala index b85ac6dd82f..15f608fb808 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Schema.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/Schema.scala @@ -4,27 +4,39 @@ import com.fasterxml.jackson.annotation.{JsonCreator, JsonIgnore, JsonProperty} import com.google.common.base.Preconditions.checkNotNull import scala.collection.immutable.ListMap -import scala.collection.mutable +/** + * Represents the schema of a tuple, consisting of a list of attributes. + * The schema is immutable, and any modifications result in a new Schema instance. + */ case class Schema @JsonCreator() ( - @JsonProperty(value = "attributes", required = true) attributes: List[Attribute] + @JsonProperty(value = "attributes", required = true) attributes: List[Attribute] = List() ) extends Serializable { checkNotNull(attributes) - val attributeIndex: Map[String, Int] = + // Maps attribute names (case-insensitive) to their indices in the schema. + private val attributeIndex: Map[String, Int] = attributes.view.map(_.getName.toLowerCase).zipWithIndex.toMap - def this(attrs: Attribute*) = { - this(attrs.toList) - } + def this(attrs: Attribute*) = this(attrs.toList) + /** + * Returns the list of attributes in the schema. + */ @JsonProperty(value = "attributes") def getAttributes: List[Attribute] = attributes + /** + * Returns a list of all attribute names in the schema. + */ @JsonIgnore def getAttributeNames: List[String] = attributes.map(_.getName) + /** + * Returns the index of a specified attribute by name. + * Throws an exception if the attribute is not found. + */ def getIndex(attributeName: String): Int = { if (!containsAttribute(attributeName)) { throw new RuntimeException(s"$attributeName is not contained in the schema") @@ -32,8 +44,14 @@ case class Schema @JsonCreator() ( attributeIndex(attributeName.toLowerCase) } + /** + * Retrieves an attribute by its name. 
+ */ def getAttribute(attributeName: String): Attribute = attributes(getIndex(attributeName)) + /** + * Checks whether the schema contains an attribute with the specified name. + */ @JsonIgnore def containsAttribute(attributeName: String): Boolean = attributeIndex.contains(attributeName.toLowerCase) @@ -46,165 +64,122 @@ case class Schema @JsonCreator() ( result } - override def equals(obj: Any): Boolean = + override def equals(obj: Any): Boolean = { obj match { - case that: Schema => - this.attributes == that.attributes && this.attributeIndex == that.attributeIndex - case _ => false + case that: Schema => this.attributes == that.attributes + case _ => false } + } - override def toString: String = s"Schema[$attributes]" + override def toString: String = s"Schema[${attributes.map(_.toString).mkString(", ")}]" + /** + * Creates a new Schema containing only the specified attributes. + */ def getPartialSchema(attributeNames: List[String]): Schema = { Schema(attributeNames.map(name => getAttribute(name))) } /** - * This method converts to a Schema into a raw format, where each pair of attribute name and attribute type - * are represented as string. This is for serialization between languages. + * Converts the schema into a raw format where each attribute name + * and attribute type are represented as strings. Useful for serialization across languages. */ def toRawSchema: Map[String, String] = - getAttributes.foldLeft(ListMap[String, String]())((list, attr) => + attributes.foldLeft(ListMap[String, String]())((list, attr) => list + (attr.getName -> attr.getType.name()) ) -} - -object Schema { - def fromRawSchema(raw: Map[String, String]): Schema = { - Schema(raw.map { - case (name, attrType) => - new Attribute(name, AttributeType.valueOf(attrType)) - }.toList) - } - - def builder(): Builder = Builder() - - case class Builder(private var attributes: List[Attribute] = List.empty) { - private val attributeNames: mutable.Set[String] = mutable.Set.empty - - def add(attribute: Attribute): Builder = { - require(attribute != null, "edu.ics.uci.amber.model.tuple.model.Attribute cannot be null") - checkAttributeNotExists(attribute.getName) - attributes ::= attribute - attributeNames += attribute.getName.toLowerCase - this - } - - def add(attributeName: String, attributeType: AttributeType): Builder = { - add(new Attribute(attributeName, attributeType)) - this - } - - def add(attributes: Iterable[Attribute]): Builder = { - attributes.foreach(add) - this + /** + * Creates a new Schema by adding multiple attributes to the current schema. + * Throws an exception if any attribute name already exists in the schema. + */ + def add(attributesToAdd: Iterable[Attribute]): Schema = { + val existingNames = this.getAttributeNames.map(_.toLowerCase).toSet + val duplicateNames = attributesToAdd.map(_.getName.toLowerCase).toSet.intersect(existingNames) + + if (duplicateNames.nonEmpty) { + throw new RuntimeException( + s"Cannot add attributes with duplicate names: ${duplicateNames.mkString(", ")}" + ) } - def add(attributes: Attribute*): Builder = { - attributes.foreach(add) - this - } + val newAttributes = attributes ++ attributesToAdd + Schema(newAttributes) + } - def add(schema: Schema): Builder = { - checkNotNull(schema) - add(schema.getAttributes) - this - } + /** + * Creates a new Schema by adding multiple attributes. + * Accepts a variable number of `Attribute` arguments. + * Throws an exception if any attribute name already exists in the schema. 
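For example (attribute names here are illustrative), Schema().add(new Attribute("a", AttributeType.INTEGER), new Attribute("b", AttributeType.STRING)) builds a two-attribute schema, replacing the removed Schema.builder()...build() chain. 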
+ */ + def add(attributes: Attribute*): Schema = { + this.add(attributes) + } - def build(): Schema = Schema(attributes.reverse) - - /** - * Removes an attribute from the schema builder if it exists. - * - * @param attribute , the name of the attribute - * @return this Builder object - */ - def removeIfExists(attribute: String): Builder = { - checkNotNull(attribute) - attributes = attributes.filter((attr: Attribute) => !attr.getName.equalsIgnoreCase(attribute)) - attributeNames.remove(attribute.toLowerCase) - this + /** + * Creates a new Schema by adding a single attribute to the current schema. + * Throws an exception if the attribute name already exists in the schema. + */ + def add(attribute: Attribute): Schema = { + if (containsAttribute(attribute.getName)) { + throw new RuntimeException( + s"Attribute name '${attribute.getName}' already exists in the schema" + ) } + add(List(attribute)) + } - /** - * Removes the attributes from the schema builder if they exist. - * - * @param attributes , the names of the attributes - * @return this Builder object - */ - def removeIfExists(attributes: Iterable[String]): Builder = { - checkNotNull(attributes) - attributes.foreach((attr: String) => checkNotNull(attr)) - attributes.foreach((attr: String) => this.removeIfExists(attr)) - this - } + /** + * Creates a new Schema by adding an attribute with the specified name and type. + * Throws an exception if the attribute name already exists in the schema. + */ + def add(attributeName: String, attributeType: AttributeType): Schema = + add(new Attribute(attributeName, attributeType)) - /** - * Removes the attributes from the schema builder if they exist. - * - * @param attributes , the names of the attributes - * @return this Builder object - */ - def removeIfExists(attributes: String*): Builder = { - checkNotNull(attributes) - this.removeIfExists(attributes) - this - } + /** + * Creates a new Schema by merging it with another schema. + * Throws an exception if there are duplicate attribute names. + */ + def add(schema: Schema): Schema = { + add(schema.attributes) + } - /** - * Removes an attribute from the schema builder. - * Fails if the attribute does not exist. - * - * @param attribute , the name of the attribute - * @return this Builder object - */ - def remove(attribute: String): Builder = { - checkNotNull(attribute) - checkAttributeExists(attribute) - removeIfExists(attribute) - this + /** + * Creates a new Schema by removing attributes with the specified names. + * Throws an exception if any of the specified attributes do not exist in the schema. + */ + def remove(attributeNames: Iterable[String]): Schema = { + val attributesToRemove = attributeNames.map(_.toLowerCase).toSet + + // Check for non-existent attributes + val nonExistentAttributes = attributesToRemove.diff(attributes.map(_.getName.toLowerCase).toSet) + if (nonExistentAttributes.nonEmpty) { + throw new IllegalArgumentException( + s"Cannot remove non-existent attributes: ${nonExistentAttributes.mkString(", ")}" + ) } - /** - * Removes the attributes from the schema builder. - * Fails if an attributes does not exist. 
- */ - def remove(attributes: Iterable[String]): Builder = { - checkNotNull(attributes) - attributes.foreach(attrName => checkNotNull(attrName)) - attributes.foreach(this.checkAttributeExists) - this.removeIfExists(attributes) - this - } + val remainingAttributes = + attributes.filterNot(attr => attributesToRemove.contains(attr.getName.toLowerCase)) + Schema(remainingAttributes) + } - /** - * Removes the attributes from the schema builder. - * Fails if an attributes does not exist. - * - * @param attributes - * @return the builder itself - */ - def remove(attributes: String*): Builder = { - checkNotNull(attributes) - this.remove(attributes) - this - } + /** + * Creates a new Schema by removing a single attribute with the specified name. + */ + def remove(attributeName: String): Schema = remove(List(attributeName)) +} - private def checkAttributeNotExists(attributeName: String): Unit = { - if (attributeNames.contains(attributeName.toLowerCase)) { - throw new RuntimeException( - s"edu.ics.uci.amber.model.tuple.model.Attribute $attributeName already exists in the schema" - ) - } - } +object Schema { - private def checkAttributeExists(attributeName: String): Unit = { - if (!attributeNames.contains(attributeName.toLowerCase)) { - throw new RuntimeException( - s"edu.ics.uci.amber.model.tuple.model.Attribute $attributeName does not exist in the schema" - ) - } - } + /** + * Creates a Schema instance from a raw map representation. + * Each entry in the map contains an attribute name and its type as strings. + */ + def fromRawSchema(raw: Map[String, String]): Schema = { + Schema(raw.map { + case (name, attrType) => + new Attribute(name, AttributeType.valueOf(attrType)) + }.toList) } } diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/TupleUtils.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/TupleUtils.scala index c8735782748..9323f10a5ac 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/TupleUtils.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/core/tuple/TupleUtils.scala @@ -42,13 +42,11 @@ object TupleUtils { result.toArray })) - val schema = Schema - .builder() - .add( - sortedFieldNames.indices - .map(i => new Attribute(sortedFieldNames(i), attributeTypes(i))) - ) - .build() + val schema = Schema( + sortedFieldNames.indices + .map(i => new Attribute(sortedFieldNames(i), attributeTypes(i))) + .toList + ) try { val fields = scala.collection.mutable.ArrayBuffer.empty[Any] diff --git a/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/ArrowUtils.scala b/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/ArrowUtils.scala index ae205511d5b..19ac52b501d 100644 --- a/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/ArrowUtils.scala +++ b/core/workflow-core/src/main/scala/edu/uci/ics/amber/util/ArrowUtils.scala @@ -78,13 +78,11 @@ object ArrowUtils extends LazyLogging { * @return A Texera Schema. */ def toTexeraSchema(arrowSchema: org.apache.arrow.vector.types.pojo.Schema): Schema = - Schema - .builder() - .add( - arrowSchema.getFields.asScala - .map(field => new Attribute(field.getName, toAttributeType(field.getType))) - ) - .build() + Schema( + arrowSchema.getFields.asScala.map { field => + new Attribute(field.getName, toAttributeType(field.getType)) + }.toList + ) /** * Converts an ArrowType into an AttributeType. 
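The raw-schema round trip above can be sketched as follows (the attribute names are made up; the calls are the ones this patch defines on the immutable Schema):

```scala
import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}

// Build a schema with the new immutable API, serialize it to the raw
// name -> type-name map, then reconstruct it on the receiving side.
val schema = Schema()
  .add("id", AttributeType.INTEGER)
  .add("name", AttributeType.STRING)

val raw: Map[String, String] = schema.toRawSchema // Map("id" -> "INTEGER", "name" -> "STRING")
assert(Schema.fromRawSchema(raw) == schema) // equality compares the attribute lists
```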
diff --git a/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/SchemaSpec.scala b/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/SchemaSpec.scala new file mode 100644 index 00000000000..827a0d3264a --- /dev/null +++ b/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/SchemaSpec.scala @@ -0,0 +1,258 @@ +package edu.uci.ics.amber.core.tuple + +import org.scalatest.flatspec.AnyFlatSpec + +class SchemaSpec extends AnyFlatSpec { + + "Schema" should "create an empty schema" in { + val schema = Schema() + assert(schema.getAttributes.isEmpty) + assert(schema.getAttributeNames.isEmpty) + } + + it should "create a schema with attributes of all types" in { + val schema = Schema( + List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER), + new Attribute("longAttr", AttributeType.LONG), + new Attribute("doubleAttr", AttributeType.DOUBLE), + new Attribute("booleanAttr", AttributeType.BOOLEAN), + new Attribute("timestampAttr", AttributeType.TIMESTAMP), + new Attribute("binaryAttr", AttributeType.BINARY) + ) + ) + assert( + schema.getAttributes == List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER), + new Attribute("longAttr", AttributeType.LONG), + new Attribute("doubleAttr", AttributeType.DOUBLE), + new Attribute("booleanAttr", AttributeType.BOOLEAN), + new Attribute("timestampAttr", AttributeType.TIMESTAMP), + new Attribute("binaryAttr", AttributeType.BINARY) + ) + ) + assert( + schema.getAttributeNames == List( + "stringAttr", + "integerAttr", + "longAttr", + "doubleAttr", + "booleanAttr", + "timestampAttr", + "binaryAttr" + ) + ) + } + + it should "add a single attribute using add(Attribute)" in { + val schema = Schema() + val updatedSchema = schema.add(new Attribute("id", AttributeType.INTEGER)) + + assert(updatedSchema.getAttributes == List(new Attribute("id", AttributeType.INTEGER))) + } + + it should "add multiple attributes using add(Attribute*)" in { + val schema = Schema() + val updatedSchema = schema.add( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER), + new Attribute("longAttr", AttributeType.LONG) + ) + + assert( + updatedSchema.getAttributes == List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER), + new Attribute("longAttr", AttributeType.LONG) + ) + ) + } + + it should "add attributes from another schema using add(Schema)" in { + val schema1 = Schema(List(new Attribute("id", AttributeType.INTEGER))) + val schema2 = Schema(List(new Attribute("name", AttributeType.STRING))) + + val mergedSchema = schema1.add(schema2) + + assert( + mergedSchema.getAttributes == List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + ) + } + + it should "add an attribute with name and type using add(String, AttributeType)" in { + val schema = Schema() + val updatedSchema = schema.add("id", AttributeType.INTEGER) + + assert(updatedSchema.getAttributes == List(new Attribute("id", AttributeType.INTEGER))) + } + + it should "remove an existing attribute" in { + val schema = Schema( + List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + ) + + val updatedSchema = schema.remove("id") + + assert(updatedSchema.getAttributes == List(new Attribute("name", AttributeType.STRING))) + } + + it should "throw an exception when removing a non-existent 
attribute" in { + val schema = Schema( + List(new Attribute("id", AttributeType.INTEGER)) + ) + + val exception = intercept[IllegalArgumentException] { + schema.remove("name") + } + assert(exception.getMessage == "Cannot remove non-existent attributes: name") + } + + it should "retrieve an attribute by name" in { + val schema = Schema( + List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + ) + + val attribute = schema.getAttribute("id") + + assert(attribute == new Attribute("id", AttributeType.INTEGER)) + } + + it should "throw an exception when retrieving a non-existent attribute" in { + val schema = Schema(List(new Attribute("id", AttributeType.INTEGER))) + + val exception = intercept[RuntimeException] { + schema.getAttribute("name") + } + assert(exception.getMessage == "name is not contained in the schema") + } + + it should "return a partial schema for attributes of all types" in { + val schema = Schema( + List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER), + new Attribute("booleanAttr", AttributeType.BOOLEAN), + new Attribute("doubleAttr", AttributeType.DOUBLE) + ) + ) + + val partialSchema = schema.getPartialSchema(List("stringAttr", "booleanAttr")) + + assert( + partialSchema.getAttributes == List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("booleanAttr", AttributeType.BOOLEAN) + ) + ) + } + + it should "convert to raw schema and back for attributes of all types" in { + val schema = Schema( + List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER), + new Attribute("longAttr", AttributeType.LONG), + new Attribute("doubleAttr", AttributeType.DOUBLE), + new Attribute("booleanAttr", AttributeType.BOOLEAN), + new Attribute("timestampAttr", AttributeType.TIMESTAMP), + new Attribute("binaryAttr", AttributeType.BINARY) + ) + ) + + val rawSchema = schema.toRawSchema + assert( + rawSchema == Map( + "stringAttr" -> "STRING", + "integerAttr" -> "INTEGER", + "longAttr" -> "LONG", + "doubleAttr" -> "DOUBLE", + "booleanAttr" -> "BOOLEAN", + "timestampAttr" -> "TIMESTAMP", + "binaryAttr" -> "BINARY" + ) + ) + + val reconstructedSchema = Schema.fromRawSchema(rawSchema) + assert(reconstructedSchema == schema) + } + + it should "check if attributes exist in schema" in { + val schema = Schema( + List( + new Attribute("stringAttr", AttributeType.STRING), + new Attribute("integerAttr", AttributeType.INTEGER) + ) + ) + + assert(schema.containsAttribute("stringAttr")) + assert(!schema.containsAttribute("nonExistentAttr")) + } + + it should "return the index of an attribute by name" in { + val schema = Schema( + List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + ) + + assert(schema.getIndex("id") == 0) + assert(schema.getIndex("name") == 1) + } + + it should "throw an exception when getting the index of a non-existent attribute" in { + val schema = Schema(List(new Attribute("id", AttributeType.INTEGER))) + + val exception = intercept[RuntimeException] { + schema.getIndex("name") + } + assert(exception.getMessage == "name is not contained in the schema") + } + + it should "compare schemas for equality" in { + val schema1 = Schema( + List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + ) + val schema2 = Schema( + List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + 
) + val schema3 = Schema( + List( + new Attribute("id", AttributeType.INTEGER) + ) + ) + + assert(schema1 == schema2) + assert(schema1 != schema3) + } + + it should "return a proper string representation" in { + val schema = Schema( + List( + new Attribute("id", AttributeType.INTEGER), + new Attribute("name", AttributeType.STRING) + ) + ) + + assert( + schema.toString == "Schema[Attribute[name=id, type=integer], Attribute[name=name, type=string]]" + ) + } +} diff --git a/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/TupleSpec.scala b/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/TupleSpec.scala index f5941c22c85..b5a3897df75 100644 --- a/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/TupleSpec.scala +++ b/core/workflow-core/src/test/scala/edu/uci/ics/amber/core/tuple/TupleSpec.scala @@ -18,20 +18,20 @@ class TupleSpec extends AnyFlatSpec { it should "create a tuple with capitalized attributeName" in { - val schema = Schema.builder().add(capitalizedStringAttribute).build() + val schema = Schema().add(capitalizedStringAttribute) val tuple = Tuple.builder(schema).add(capitalizedStringAttribute, "string-value").build() assert(tuple.getField("COL-string").asInstanceOf[String] == "string-value") } it should "create a tuple with capitalized attributeName, using addSequentially" in { - val schema = Schema.builder().add(capitalizedStringAttribute).build() + val schema = Schema().add(capitalizedStringAttribute) val tuple = Tuple.builder(schema).addSequentially(Array("string-value")).build() assert(tuple.getField("COL-string").asInstanceOf[String] == "string-value") } it should "create a tuple using new builder, based on another tuple using old builder" in { - val schema = Schema.builder().add(stringAttribute).build() + val schema = Schema().add(stringAttribute) val inputTuple = Tuple.builder(schema).addSequentially(Array("string-value")).build() val newTuple = Tuple.builder(inputTuple.getSchema).add(inputTuple).build() @@ -39,22 +39,21 @@ class TupleSpec extends AnyFlatSpec { } it should "fail when unknown attribute is added to tuple" in { - val schema = Schema.builder().add(stringAttribute).build() + val schema = Schema().add(stringAttribute) assertThrows[TupleBuildingException] { Tuple.builder(schema).add(integerAttribute, 1) } } it should "fail when tuple does not conform to complete schema" in { - val schema = Schema.builder().add(stringAttribute).add(integerAttribute).build() + val schema = Schema().add(stringAttribute).add(integerAttribute) assertThrows[TupleBuildingException] { Tuple.builder(schema).add(integerAttribute, 1).build() } } it should "fail when entire tuple passed in has extra attributes" in { - val inputSchema = - Schema.builder().add(stringAttribute).add(integerAttribute).add(boolAttribute).build() + val inputSchema = Schema().add(stringAttribute).add(integerAttribute).add(boolAttribute) val inputTuple = Tuple .builder(inputSchema) .add(integerAttribute, 1) @@ -62,7 +61,7 @@ class TupleSpec extends AnyFlatSpec { .add(boolAttribute, true) .build() - val outputSchema = Schema.builder().add(stringAttribute).add(integerAttribute).build() + val outputSchema = Schema().add(stringAttribute).add(integerAttribute) assertThrows[TupleBuildingException] { Tuple.builder(outputSchema).add(inputTuple).build() } @@ -70,7 +69,7 @@ class TupleSpec extends AnyFlatSpec { it should "not fail when entire tuple passed in has extra attributes and strictSchemaMatch is false" in { val inputSchema = - 
Schema.builder().add(stringAttribute).add(integerAttribute).add(boolAttribute).build() + Schema().add(stringAttribute).add(integerAttribute).add(boolAttribute) val inputTuple = Tuple .builder(inputSchema) .add(integerAttribute, 1) @@ -78,7 +77,7 @@ class TupleSpec extends AnyFlatSpec { .add(boolAttribute, true) .build() - val outputSchema = Schema.builder().add(stringAttribute).add(integerAttribute).build() + val outputSchema = Schema().add(stringAttribute).add(integerAttribute) val outputTuple = Tuple.builder(outputSchema).add(inputTuple, false).build() // This is the important test. Input tuple has 3 attributes but output tuple has only 2 @@ -88,7 +87,7 @@ class TupleSpec extends AnyFlatSpec { it should "produce identical strings" in { val inputSchema = - Schema.builder().add(stringAttribute).add(integerAttribute).add(boolAttribute).build() + Schema().add(stringAttribute).add(integerAttribute).add(boolAttribute) val inputTuple = Tuple .builder(inputSchema) .add(integerAttribute, 1) @@ -104,8 +103,7 @@ class TupleSpec extends AnyFlatSpec { it should "calculate hash" in { val inputSchema = - Schema - .builder() + Schema() .add(integerAttribute) .add(stringAttribute) .add(boolAttribute) @@ -113,7 +111,6 @@ class TupleSpec extends AnyFlatSpec { .add(doubleAttribute) .add(timestampAttribute) .add(binaryAttribute) - .build() val inputTuple = Tuple .builder(inputSchema) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala index e60040eb467..c7fd35a93bc 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/SpecialPhysicalOpFactory.scala @@ -44,11 +44,7 @@ object SpecialPhysicalOpFactory { case SET_SNAPSHOT | SINGLE_SNAPSHOT => if (inputSchema.containsAttribute(ProgressiveUtils.insertRetractFlagAttr.getName)) { // with insert/retract delta: remove the flag column - Schema - .builder() - .add(inputSchema) - .remove(ProgressiveUtils.insertRetractFlagAttr.getName) - .build() + inputSchema.remove(ProgressiveUtils.insertRetractFlagAttr.getName) } else { // with insert-only delta: output schema is the same as input schema inputSchema diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala index 0ea2557f4ef..80ad4892782 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/aggregate/AggregateOpDesc.scala @@ -54,15 +54,12 @@ class AggregateOpDesc extends LogicalOp { .withPropagateSchema( SchemaPropagationFunc(inputSchemas => { val inputSchema = inputSchemas(operatorInfo.inputPorts.head.id) - val outputSchema = Schema - .builder() - .add(groupByKeys.map(key => inputSchema.getAttribute(key)): _*) - .add( + val outputSchema = Schema( + groupByKeys.map(key => inputSchema.getAttribute(key)) ++ localAggregations.map(agg => agg.getAggregationAttribute(inputSchema.getAttribute(agg.attribute).getType) ) - ) - .build() + ) Map(PortIdentity(internal = true) -> outputSchema) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala index c17a94e3a40..ca6db486b43 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpDesc.scala @@ -39,12 +39,12 @@ class CartesianProductOpDesc extends LogicalOp { // In this example, the last attribute from the right schema (`dup`) is renamed to `dup#@3` // to avoid conflicts. - val builder = Schema.builder() + var outputSchema = Schema() val leftSchema = inputSchemas(operatorInfo.inputPorts.head.id) val rightSchema = inputSchemas(operatorInfo.inputPorts.last.id) val leftAttributeNames = leftSchema.getAttributeNames val rightAttributeNames = rightSchema.getAttributeNames - builder.add(leftSchema) + outputSchema = outputSchema.add(leftSchema) rightSchema.getAttributes.foreach(attr => { var newName = attr.getName while ( @@ -56,13 +56,12 @@ class CartesianProductOpDesc extends LogicalOp { } if (newName == attr.getName) { // non-duplicate attribute, add to builder as is - builder.add(attr) + outputSchema = outputSchema.add(attr) } else { // renamed the duplicate attribute, construct new Attribute - builder.add(new Attribute(newName, attr.getType)) + outputSchema = outputSchema.add(new Attribute(newName, attr.getType)) } }) - val outputSchema = builder.build() Map(operatorInfo.outputPorts.head.id -> outputSchema) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala index 2a82b03d10b..3b5a60f2220 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpDesc.scala @@ -2,7 +2,8 @@ package edu.uci.ics.amber.operator.dictionary import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} +import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.map.MapOpDesc @@ -49,11 +50,8 @@ class DictionaryMatcherOpDesc extends MapOpDesc { SchemaPropagationFunc(inputSchemas => { if (resultAttribute == null || resultAttribute.trim.isEmpty) return null Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() - .add(inputSchemas.values.head) - .add(resultAttribute, AttributeType.BOOLEAN) - .build() + operatorInfo.outputPorts.head.id -> inputSchemas.values.head + .add(new Attribute(resultAttribute, AttributeType.BOOLEAN)) ) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala index 756f468f46d..9b8429e4e18 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala +++ 
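The CartesianProductOpDesc loop above renames a clashing right-side attribute by appending a #@<index> suffix until it is unique (the exact collision predicate sits outside the hunk; an "already used anywhere" set is assumed below). Extracted as a helper over the toy types:

    def deduplicateName(name: String, taken: Set[String]): String =
      if (!taken.contains(name)) name
      else Iterator.from(1).map(i => s"$name#@$i").dropWhile(taken.contains).next()

    // deduplicateName("dup", Set("id", "dup"))           == "dup#@1"
    // deduplicateName("dup", Set("id", "dup", "dup#@1")) == "dup#@2"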
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpDesc.scala @@ -4,14 +4,13 @@ import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} -import edu.uci.ics.amber.core.workflow._ -import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.core.virtualidentity.{ ExecutionIdentity, PhysicalOpIdentity, WorkflowIdentity } -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalLink, PortIdentity} +import edu.uci.ics.amber.core.workflow._ +import edu.uci.ics.amber.operator.LogicalOp import edu.uci.ics.amber.operator.hashJoin.HashJoinOpDesc.HASH_JOIN_INTERNAL_KEY_NAME import edu.uci.ics.amber.operator.metadata.annotations.{ AutofillAttributeName, @@ -79,11 +78,9 @@ class HashJoinOpDesc[K] extends LogicalOp { .withPropagateSchema( SchemaPropagationFunc(inputSchemas => Map( - PortIdentity(internal = true) -> Schema - .builder() - .add(HASH_JOIN_INTERNAL_KEY_NAME, AttributeType.ANY) - .add(inputSchemas(operatorInfo.inputPorts.head.id)) - .build() + PortIdentity(internal = true) -> Schema( + List(new Attribute(HASH_JOIN_INTERNAL_KEY_NAME, AttributeType.ANY)) + ).add(inputSchemas(operatorInfo.inputPorts.head.id)) ) ) ) @@ -121,27 +118,29 @@ class HashJoinOpDesc[K] extends LogicalOp { SchemaPropagationFunc(inputSchemas => { val buildSchema = inputSchemas(PortIdentity(internal = true)) val probeSchema = inputSchemas(PortIdentity(1)) - val builder = Schema.builder() - builder.add(buildSchema) - builder.removeIfExists(HASH_JOIN_INTERNAL_KEY_NAME) - val leftAttributeNames = buildSchema.getAttributeNames - val rightAttributeNames = - probeSchema.getAttributeNames.filterNot(name => name == probeAttributeName) - - // Create a Map from rightTuple's fields, renaming conflicts - rightAttributeNames - .foreach { name => - var newName = name - while ( - leftAttributeNames.contains(newName) || rightAttributeNames - .filter(attrName => name != attrName) - .contains(newName) - ) { - newName = s"$newName#@1" + + // Start with the attributes from the build schema, excluding the hash join internal key + val leftAttributes = + buildSchema.getAttributes.filterNot(_.getName == HASH_JOIN_INTERNAL_KEY_NAME) + val leftAttributeNames = leftAttributes.map(_.getName).toSet + + // Filter and rename attributes from the probe schema to avoid conflicts + val rightAttributes = probeSchema.getAttributes + .filterNot(_.getName == probeAttributeName) + .map { attr => + var newName = attr.getName + while (leftAttributeNames.contains(newName)) { + val suffixIndex = """#@(\d+)$""".r + .findFirstMatchIn(newName) + .map(_.group(1).toInt + 1) + .getOrElse(1) + newName = s"${attr.getName}#@$suffixIndex" } - builder.add(new Attribute(newName, probeSchema.getAttribute(name).getType)) + new Attribute(newName, attr.getType) } - val outputSchema = builder.build() + + // Combine left and right attributes into a new schema + val outputSchema = Schema(leftAttributes ++ rightAttributes) Map(PortIdentity() -> outputSchema) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala index dcef5abf438..283f2168b10 100644 --- 
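The HashJoinOpDesc rewrite above also changes how rename collisions escalate: the old loop appended #@1 repeatedly (dup#@1#@1, ...), while the new code parses the current #@k suffix back out and increments it. Isolated, with a quick check:

    // Rename until unique; increment the #@k counter instead of stacking suffixes.
    def renameWithCounter(name: String, taken: Set[String]): String = {
      val suffixPattern = """#@(\d+)$""".r
      var newName = name
      while (taken.contains(newName)) {
        val next = suffixPattern
          .findFirstMatchIn(newName)
          .map(_.group(1).toInt + 1)
          .getOrElse(1)
        newName = s"$name#@$next"
      }
      newName
    }

    // renameWithCounter("dup", Set("dup", "dup#@1")) == "dup#@2"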
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceIrisLogisticRegressionOpDesc.scala @@ -99,12 +99,9 @@ class HuggingFaceIrisLogisticRegressionOpDesc extends PythonOperatorDescriptor { ) throw new RuntimeException("Result attribute name should not be empty") Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() - .add(inputSchemas(operatorInfo.inputPorts.head.id)) + operatorInfo.outputPorts.head.id -> inputSchemas(operatorInfo.inputPorts.head.id) .add(predictionClassName, AttributeType.STRING) .add(predictionProbabilityName, AttributeType.DOUBLE) - .build() ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala index 5e9027951a9..875bc5d1b61 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSentimentAnalysisOpDesc.scala @@ -87,13 +87,10 @@ class HuggingFaceSentimentAnalysisOpDesc extends PythonOperatorDescriptor { ) return null Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() - .add(inputSchemas(operatorInfo.inputPorts.head.id)) + operatorInfo.outputPorts.head.id -> inputSchemas(operatorInfo.inputPorts.head.id) .add(resultAttributePositive, AttributeType.DOUBLE) .add(resultAttributeNeutral, AttributeType.DOUBLE) .add(resultAttributeNegative, AttributeType.DOUBLE) - .build() ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala index 4257c17a6d5..d12dceffc40 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceSpamSMSDetectionOpDesc.scala @@ -58,12 +58,9 @@ class HuggingFaceSpamSMSDetectionOpDesc extends PythonOperatorDescriptor { inputSchemas: Map[PortIdentity, Schema] ): Map[PortIdentity, Schema] = { Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() - .add(inputSchemas.values.head) + operatorInfo.outputPorts.head.id -> inputSchemas.values.head .add(resultAttributeSpam, AttributeType.BOOLEAN) .add(resultAttributeProbability, AttributeType.DOUBLE) - .build() ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala index e79369fb959..86b3059d36d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/huggingFace/HuggingFaceTextSummarizationOpDesc.scala @@ -63,11 +63,8 @@ class HuggingFaceTextSummarizationOpDesc extends PythonOperatorDescriptor { if (resultAttribute == null || resultAttribute.trim.isEmpty) throw new RuntimeException("Result attribute name should be given") Map( - operatorInfo.outputPorts.head.id -> 
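The HuggingFace descriptors above now derive their outputs by chaining add directly on the input schema, which is safe precisely because the API is immutable: the input schema object is never modified. With the toy types from the first sketch:

    val input = Schema().add("text", "string")

    val output = input
      .add("prediction_class", "string")
      .add("prediction_probability", "double")

    assert(input.attributes.map(_.name) == List("text")) // input untouched
    assert(output.attributes.map(_.name) ==
      List("text", "prediction_class", "prediction_probability"))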
Schema - .builder() - .add(inputSchemas.values.head) + operatorInfo.outputPorts.head.id -> inputSchemas.values.head .add(resultAttribute, AttributeType.STRING) - .build() ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala index 764a42b2708..dd61c510a1a 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/intervalJoin/IntervalJoinOpDesc.scala @@ -92,19 +92,21 @@ class IntervalJoinOpDesc extends LogicalOp { .withOutputPorts(operatorInfo.outputPorts) .withPropagateSchema( SchemaPropagationFunc(inputSchemas => { - val builder: Schema.Builder = Schema.builder() val leftTableSchema: Schema = inputSchemas(operatorInfo.inputPorts.head.id) val rightTableSchema: Schema = inputSchemas(operatorInfo.inputPorts.last.id) - builder.add(leftTableSchema) - rightTableSchema.getAttributes - .map(attr => { - if (leftTableSchema.containsAttribute(attr.getName)) { - builder.add(new Attribute(s"${attr.getName}#@1", attr.getType)) + + // Start with the left table schema + val outputSchema = rightTableSchema.getAttributes.foldLeft(leftTableSchema) { + (currentSchema, attr) => + if (currentSchema.containsAttribute(attr.getName)) { + // Add the attribute with a suffix to avoid conflicts + currentSchema.add(new Attribute(s"${attr.getName}#@1", attr.getType)) } else { - builder.add(attr.getName, attr.getType) + // Add the attribute as is + currentSchema.add(attr) } - }) - val outputSchema = builder.build() + } + Map(operatorInfo.outputPorts.head.id -> outputSchema) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala index 62ca41b34eb..a73ceb3ae14 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/Scorer/MachineLearningScorerOpDesc.scala @@ -67,21 +67,22 @@ class MachineLearningScorerOpDesc extends PythonOperatorDescriptor { override def getOutputSchemas( inputSchemas: Map[PortIdentity, Schema] ): Map[PortIdentity, Schema] = { - val outputSchemaBuilder = Schema.builder() - if (!isRegression) { - outputSchemaBuilder.add(new Attribute("Class", AttributeType.STRING)) - } - val metrics = if (isRegression) { regressionMetrics.map(_.getName()) } else { classificationMetrics.map(_.getName()) } - metrics.foreach(metricName => { - outputSchemaBuilder.add(new Attribute(metricName, AttributeType.DOUBLE)) - }) + val baseSchema = if (!isRegression) { + Schema(List(new Attribute("Class", AttributeType.STRING))) + } else { + Schema(List()) + } + + val outputSchema = metrics.foldLeft(baseSchema) { (currentSchema, metricName) => + currentSchema.add(new Attribute(metricName, AttributeType.DOUBLE)) + } - Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + Map(operatorInfo.outputPorts.head.id -> outputSchema) } // private def getClassificationScorerName(scorer: classificationMetricsFnc): String = { diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala 
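Both the IntervalJoinOpDesc and MachineLearningScorerOpDesc hunks above use foldLeft to thread an immutable schema through a collection, replacing the mutate-a-builder loop. The scorer case reduces to:

    val metrics = List("accuracy", "precision", "recall") // assumed metric names

    // Each step returns a new Schema; the fold threads it through the list.
    val scoreSchema = metrics.foldLeft(Schema(List(Attribute("Class", "string")))) {
      (schema, metric) => schema.add(metric, "double")
    }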
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala index 0d35b6cbc85..5e035fd1b5c 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/machineLearning/sklearnAdvanced/base/SklearnAdvancedBaseDesc.scala @@ -152,10 +152,13 @@ abstract class SklearnMLOperatorDescriptor[T <: ParamClass] extends PythonOperat override def getOutputSchemas( inputSchemas: Map[PortIdentity, Schema] ): Map[PortIdentity, Schema] = { - val outputSchemaBuilder = Schema.builder() - outputSchemaBuilder.add(new Attribute("Model", AttributeType.BINARY)) - outputSchemaBuilder.add(new Attribute("Parameters", AttributeType.STRING)) + val outputSchema = Schema( + List( + new Attribute("Model", AttributeType.BINARY), + new Attribute("Parameters", AttributeType.STRING) + ) + ) - Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + Map(operatorInfo.outputPorts.head.id -> outputSchema) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala index 39183a07ea5..120b5996910 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/projection/ProjectionOpDesc.scala @@ -1,17 +1,15 @@ package edu.uci.ics.amber.operator.projection import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} -import com.google.common.base.Preconditions import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.tuple.{Attribute, Schema} +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.PhysicalOp.oneToOnePhysicalOp import edu.uci.ics.amber.core.workflow._ import edu.uci.ics.amber.operator.map.MapOpDesc import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.util.JSONUtils.objectMapper -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} class ProjectionOpDesc extends MapOpDesc { @@ -39,27 +37,21 @@ class ProjectionOpDesc extends MapOpDesc { .withOutputPorts(operatorInfo.outputPorts) .withDerivePartition(derivePartition()) .withPropagateSchema(SchemaPropagationFunc(inputSchemas => { - Preconditions.checkArgument(attributes.nonEmpty) + require(attributes.nonEmpty, "Attributes must not be empty") + val inputSchema = inputSchemas.values.head val outputSchema = if (!isDrop) { - Schema - .builder() - .add(attributes.map { attribute => - val originalType = inputSchema.getAttribute(attribute.getOriginalAttribute).getType - new Attribute(attribute.getAlias, originalType) - }) - .build() + attributes.foldLeft(Schema()) { (schema, attribute) => + val originalType = inputSchema.getAttribute(attribute.getOriginalAttribute).getType + schema.add(attribute.getAlias, originalType) + } } else { - val outputSchemaBuilder = Schema.builder() - outputSchemaBuilder.add(inputSchema) - for (attribute <- attributes) { - outputSchemaBuilder.removeIfExists(attribute.getOriginalAttribute) + 
attributes.foldLeft(inputSchema) { (schema, attribute) => + schema.remove(attribute.getOriginalAttribute) } - outputSchemaBuilder.build() } - Map( - operatorInfo.outputPorts.head.id -> outputSchema - ) + + Map(operatorInfo.outputPorts.head.id -> outputSchema) })) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala index 155380851ba..0ef7545be8b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sentiment/SentimentAnalysisOpDesc.scala @@ -3,14 +3,13 @@ package edu.uci.ics.amber.operator.sentiment import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaInject import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} -import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} +import edu.uci.ics.amber.core.tuple.AttributeType +import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} +import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.map.MapOpDesc -import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName +import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.util.JSONUtils.objectMapper -import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} -import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort} @JsonSchemaInject(json = """ { @@ -59,11 +58,8 @@ class SentimentAnalysisOpDesc extends MapOpDesc { if (resultAttribute == null || resultAttribute.trim.isEmpty) return null Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() - .add(inputSchemas.values.head) + operatorInfo.outputPorts.head.id -> inputSchemas.values.head .add(resultAttribute, AttributeType.INTEGER) - .build() ) }) ) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala index 2279f4126dd..190f271eb6e 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnClassifierOpDesc.scala @@ -110,11 +110,9 @@ abstract class SklearnClassifierOpDesc extends PythonOperatorDescriptor { inputSchemas: Map[PortIdentity, Schema] ): Map[PortIdentity, Schema] = { Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() + operatorInfo.outputPorts.head.id -> Schema() .add("model_name", AttributeType.STRING) .add("model", AttributeType.BINARY) - .build() ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala index 35e0e7d4d9d..430b9208e3b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala +++ 
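The ProjectionOpDesc rewrite above also swaps Guava's Preconditions.checkArgument for Scala's built-in require and expresses both modes as folds: keep mode folds the aliases into an empty schema, drop mode folds removals over the input. Condensed over the toy types (aliases is a hypothetical (original, alias) pairing):

    def keepMode(input: Schema, aliases: List[(String, String)]): Schema =
      aliases.foldLeft(Schema()) { case (schema, (original, alias)) =>
        schema.add(alias, input.attributes.find(_.name == original).get.attrType)
      }

    def dropMode(input: Schema, dropped: List[String]): Schema =
      dropped.foldLeft(input)((schema, name) => schema.remove(name))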
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnLinearRegressionOpDesc.scala @@ -63,11 +63,9 @@ class SklearnLinearRegressionOpDesc extends PythonOperatorDescriptor { inputSchemas: Map[PortIdentity, Schema] ): Map[PortIdentity, Schema] = { Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() + operatorInfo.outputPorts.head.id -> Schema() .add("model_name", AttributeType.STRING) .add("model", AttributeType.BINARY) - .build() ) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala index 6e3c8ae5cd7..4653c1b6983 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/sklearn/SklearnPredictionOpDesc.scala @@ -68,11 +68,8 @@ class SklearnPredictionOpDesc extends PythonOperatorDescriptor { inputSchema.attributes.find(attr => attr.getName == groundTruthAttribute).get.getType } Map( - operatorInfo.outputPorts.head.id -> Schema - .builder() - .add(inputSchema) + operatorInfo.outputPorts.head.id -> inputSchema .add(resultAttribute, resultType) - .build() ) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala index 6213cc26ded..3ffe3d63594 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/reddit/RedditSearchSourceOpDesc.scala @@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.source.apis.reddit import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription} import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle -import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.source.PythonSourceOperatorDescriptor import edu.uci.ics.amber.core.workflow.{OutputPort, PortIdentity} @@ -112,28 +112,24 @@ class RedditSearchSourceOpDesc extends PythonSourceOperatorDescriptor { override def asSource() = true override def sourceSchema(): Schema = - Schema - .builder() - .add( - new Attribute("id", AttributeType.STRING), - new Attribute("name", AttributeType.STRING), - new Attribute("title", AttributeType.STRING), - new Attribute("created_utc", AttributeType.TIMESTAMP), - new Attribute("edited", AttributeType.TIMESTAMP), - new Attribute("is_self", AttributeType.BOOLEAN), - new Attribute("selftext", AttributeType.STRING), - new Attribute("over_18", AttributeType.BOOLEAN), - new Attribute("is_original_content", AttributeType.BOOLEAN), - new Attribute("locked", AttributeType.BOOLEAN), - new Attribute("score", AttributeType.INTEGER), - new Attribute("upvote_ratio", AttributeType.DOUBLE), - new Attribute("num_comments", AttributeType.INTEGER), - new Attribute("permalink", AttributeType.STRING), - new Attribute("url", AttributeType.STRING), - new Attribute("author_name", AttributeType.STRING), - new Attribute("subreddit", AttributeType.STRING) - ) - .build() + Schema() + .add("id", AttributeType.STRING) + .add("name", 
AttributeType.STRING) + .add("title", AttributeType.STRING) + .add("created_utc", AttributeType.TIMESTAMP) + .add("edited", AttributeType.TIMESTAMP) + .add("is_self", AttributeType.BOOLEAN) + .add("selftext", AttributeType.STRING) + .add("over_18", AttributeType.BOOLEAN) + .add("is_original_content", AttributeType.BOOLEAN) + .add("locked", AttributeType.BOOLEAN) + .add("score", AttributeType.INTEGER) + .add("upvote_ratio", AttributeType.DOUBLE) + .add("num_comments", AttributeType.INTEGER) + .add("permalink", AttributeType.STRING) + .add("url", AttributeType.STRING) + .add("author_name", AttributeType.STRING) + .add("subreddit", AttributeType.STRING) def getOutputSchemas(inputSchemas: Map[PortIdentity, Schema]): Map[PortIdentity, Schema] = { Map(operatorInfo.outputPorts.head.id -> sourceSchema()) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala index c3a92cbcadd..d4050625b1b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterFullArchiveSearchSourceOpDesc.scala @@ -7,7 +7,7 @@ import com.kjetland.jackson.jsonSchema.annotations.{ JsonSchemaTitle } import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.annotations.UIWidget import edu.uci.ics.amber.operator.source.apis.twitter.TwitterSourceOpDesc @@ -65,44 +65,39 @@ class TwitterFullArchiveSearchSourceOpDesc extends TwitterSourceOpDesc { // twitter schema is hard coded for now. V2 API has changed many fields of the Tweet object. // we are also currently depending on redouane59/twittered client library to parse tweet fields. 
- - Schema - .builder() - .add( - new Attribute("id", AttributeType.STRING), - new Attribute("text", AttributeType.STRING), - new Attribute("created_at", AttributeType.TIMESTAMP), - new Attribute("lang", AttributeType.STRING), - new Attribute("tweet_type", AttributeType.STRING), - new Attribute("place_id", AttributeType.STRING), - new Attribute("place_coordinate", AttributeType.STRING), - new Attribute("in_reply_to_status_id", AttributeType.STRING), - new Attribute("in_reply_to_user_id", AttributeType.STRING), - new Attribute("like_count", AttributeType.LONG), - new Attribute("quote_count", AttributeType.LONG), - new Attribute("reply_count", AttributeType.LONG), - new Attribute("retweet_count", AttributeType.LONG), - new Attribute("hashtags", AttributeType.STRING), - new Attribute("symbols", AttributeType.STRING), - new Attribute("urls", AttributeType.STRING), - new Attribute("mentions", AttributeType.STRING), - new Attribute("user_id", AttributeType.STRING), - new Attribute("user_created_at", AttributeType.TIMESTAMP), - new Attribute("user_name", AttributeType.STRING), - new Attribute("user_display_name", AttributeType.STRING), - new Attribute("user_lang", AttributeType.STRING), - new Attribute("user_description", AttributeType.STRING), - new Attribute("user_followers_count", AttributeType.LONG), - new Attribute("user_following_count", AttributeType.LONG), - new Attribute("user_tweet_count", AttributeType.LONG), - new Attribute("user_listed_count", AttributeType.LONG), - new Attribute("user_location", AttributeType.STRING), - new Attribute("user_url", AttributeType.STRING), - new Attribute("user_profile_image_url", AttributeType.STRING), - new Attribute("user_pinned_tweet_id", AttributeType.STRING), - new Attribute("user_protected", AttributeType.BOOLEAN), - new Attribute("user_verified", AttributeType.BOOLEAN) - ) - .build() + Schema() + .add("id", AttributeType.STRING) + .add("text", AttributeType.STRING) + .add("created_at", AttributeType.TIMESTAMP) + .add("lang", AttributeType.STRING) + .add("tweet_type", AttributeType.STRING) + .add("place_id", AttributeType.STRING) + .add("place_coordinate", AttributeType.STRING) + .add("in_reply_to_status_id", AttributeType.STRING) + .add("in_reply_to_user_id", AttributeType.STRING) + .add("like_count", AttributeType.LONG) + .add("quote_count", AttributeType.LONG) + .add("reply_count", AttributeType.LONG) + .add("retweet_count", AttributeType.LONG) + .add("hashtags", AttributeType.STRING) + .add("symbols", AttributeType.STRING) + .add("urls", AttributeType.STRING) + .add("mentions", AttributeType.STRING) + .add("user_id", AttributeType.STRING) + .add("user_created_at", AttributeType.TIMESTAMP) + .add("user_name", AttributeType.STRING) + .add("user_display_name", AttributeType.STRING) + .add("user_lang", AttributeType.STRING) + .add("user_description", AttributeType.STRING) + .add("user_followers_count", AttributeType.LONG) + .add("user_following_count", AttributeType.LONG) + .add("user_tweet_count", AttributeType.LONG) + .add("user_listed_count", AttributeType.LONG) + .add("user_location", AttributeType.STRING) + .add("user_url", AttributeType.STRING) + .add("user_profile_image_url", AttributeType.STRING) + .add("user_pinned_tweet_id", AttributeType.STRING) + .add("user_protected", AttributeType.BOOLEAN) + .add("user_verified", AttributeType.BOOLEAN) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala 
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala index 15b0ddfaf21..20960c3181d 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/apis/twitter/v2/TwitterSearchSourceOpDesc.scala @@ -7,7 +7,7 @@ import com.kjetland.jackson.jsonSchema.annotations.{ JsonSchemaTitle } import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.annotations.UIWidget import edu.uci.ics.amber.operator.source.apis.twitter.TwitterSourceOpDesc @@ -56,43 +56,39 @@ class TwitterSearchSourceOpDesc extends TwitterSourceOpDesc { // twitter schema is hard coded for now. V2 API has changed many fields of the Tweet object. // we are also currently depending on redouane59/twittered client library to parse tweet fields. - Schema - .builder() - .add( - new Attribute("id", AttributeType.STRING), - new Attribute("text", AttributeType.STRING), - new Attribute("created_at", AttributeType.TIMESTAMP), - new Attribute("lang", AttributeType.STRING), - new Attribute("tweet_type", AttributeType.STRING), - new Attribute("place_id", AttributeType.STRING), - new Attribute("place_coordinate", AttributeType.STRING), - new Attribute("in_reply_to_status_id", AttributeType.STRING), - new Attribute("in_reply_to_user_id", AttributeType.STRING), - new Attribute("like_count", AttributeType.LONG), - new Attribute("quote_count", AttributeType.LONG), - new Attribute("reply_count", AttributeType.LONG), - new Attribute("retweet_count", AttributeType.LONG), - new Attribute("hashtags", AttributeType.STRING), - new Attribute("symbols", AttributeType.STRING), - new Attribute("urls", AttributeType.STRING), - new Attribute("mentions", AttributeType.STRING), - new Attribute("user_id", AttributeType.STRING), - new Attribute("user_created_at", AttributeType.TIMESTAMP), - new Attribute("user_name", AttributeType.STRING), - new Attribute("user_display_name", AttributeType.STRING), - new Attribute("user_lang", AttributeType.STRING), - new Attribute("user_description", AttributeType.STRING), - new Attribute("user_followers_count", AttributeType.LONG), - new Attribute("user_following_count", AttributeType.LONG), - new Attribute("user_tweet_count", AttributeType.LONG), - new Attribute("user_listed_count", AttributeType.LONG), - new Attribute("user_location", AttributeType.STRING), - new Attribute("user_url", AttributeType.STRING), - new Attribute("user_profile_image_url", AttributeType.STRING), - new Attribute("user_pinned_tweet_id", AttributeType.STRING), - new Attribute("user_protected", AttributeType.BOOLEAN), - new Attribute("user_verified", AttributeType.BOOLEAN) - ) - .build() + Schema() + .add("id", AttributeType.STRING) + .add("text", AttributeType.STRING) + .add("created_at", AttributeType.TIMESTAMP) + .add("lang", AttributeType.STRING) + .add("tweet_type", AttributeType.STRING) + .add("place_id", AttributeType.STRING) + .add("place_coordinate", AttributeType.STRING) + .add("in_reply_to_status_id", AttributeType.STRING) + .add("in_reply_to_user_id", AttributeType.STRING) + .add("like_count", AttributeType.LONG) + .add("quote_count", AttributeType.LONG) + .add("reply_count", 
AttributeType.LONG) + .add("retweet_count", AttributeType.LONG) + .add("hashtags", AttributeType.STRING) + .add("symbols", AttributeType.STRING) + .add("urls", AttributeType.STRING) + .add("mentions", AttributeType.STRING) + .add("user_id", AttributeType.STRING) + .add("user_created_at", AttributeType.TIMESTAMP) + .add("user_name", AttributeType.STRING) + .add("user_display_name", AttributeType.STRING) + .add("user_lang", AttributeType.STRING) + .add("user_description", AttributeType.STRING) + .add("user_followers_count", AttributeType.LONG) + .add("user_following_count", AttributeType.LONG) + .add("user_tweet_count", AttributeType.LONG) + .add("user_listed_count", AttributeType.LONG) + .add("user_location", AttributeType.STRING) + .add("user_url", AttributeType.STRING) + .add("user_profile_image_url", AttributeType.STRING) + .add("user_pinned_tweet_id", AttributeType.STRING) + .add("user_protected", AttributeType.BOOLEAN) + .add("user_verified", AttributeType.BOOLEAN) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala index 49f5028d718..054af768565 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/fetcher/URLFetcherOpDesc.scala @@ -28,17 +28,11 @@ class URLFetcherOpDesc extends SourceOperatorDescriptor { var decodingMethod: DecodingMethod = _ override def sourceSchema(): Schema = { - Schema - .builder() + Schema() .add( "URL content", - if (decodingMethod == DecodingMethod.UTF_8) { - AttributeType.STRING - } else { - AttributeType.ANY - } + if (decodingMethod == DecodingMethod.UTF_8) AttributeType.STRING else AttributeType.ANY ) - .build() } override def getPhysicalOp( diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala index 90c65c87eb9..437d2f2bbe3 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/FileScanSourceOpDesc.scala @@ -7,7 +7,7 @@ import com.kjetland.jackson.jsonSchema.annotations.{ JsonSchemaTitle } import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.annotations.HideAnnotation @@ -66,8 +66,10 @@ class FileScanSourceOpDesc extends ScanSourceOpDesc with TextSourceOpDesc { } override def sourceSchema(): Schema = { - val builder = Schema.builder() - if (outputFileName) builder.add(new Attribute("filename", AttributeType.STRING)) - builder.add(new Attribute(attributeName, attributeType.getType)).build() + var schema = Schema() + if (outputFileName) { + schema = schema.add("filename", AttributeType.STRING) + } + schema.add(attributeName, attributeType.getType) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala 
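The URLFetcherOpDesc and FileScanSourceOpDesc hunks above both build schemas conditionally; the file scan keeps a local var, though the same shape works as a single expression. An equivalent formulation over the toy types:

    // Choose the base schema by condition, then extend it; no mutation needed.
    def fileScanSchema(outputFileName: Boolean, attrName: String, attrType: String): Schema = {
      val base = if (outputFileName) Schema().add("filename", "string") else Schema()
      base.add(attrName, attrType)
    }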
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala index cd2fdda4bdf..f689611c5f8 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/CSVScanSourceOpDesc.scala @@ -6,7 +6,7 @@ import com.univocity.parsers.csv.{CsvFormat, CsvParser, CsvParserSettings} import edu.uci.ics.amber.core.executor.OpExecWithClassName import edu.uci.ics.amber.core.storage.DocumentFactory import edu.uci.ics.amber.core.tuple.AttributeTypeUtils.inferSchemaFromRows -import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema} +import edu.uci.ics.amber.core.tuple.{AttributeType, Schema} import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.source.scan.ScanSourceOpDesc import edu.uci.ics.amber.util.JSONUtils.objectMapper @@ -94,10 +94,9 @@ class CSVScanSourceOpDesc extends ScanSourceOpDesc { if (hasHeader) parser.getContext.headers() else (1 to attributeTypeList.length).map(i => "column-" + i).toArray - Schema - .builder() - .add(header.indices.map(i => new Attribute(header(i), attributeTypeList(i)))) - .build() + header.indices.foldLeft(Schema()) { (schema, i) => + schema.add(header(i), attributeTypeList(i)) + } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala index 4d4202da703..c5bffbf2080 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csv/ParallelCSVScanSourceOpDesc.scala @@ -86,18 +86,12 @@ class ParallelCSVScanSourceOpDesc extends ScanSourceOpDesc { reader.close() // build schema based on inferred AttributeTypes - Schema - .builder() - .add( - firstRow.indices - .map((i: Int) => - new Attribute( - if (hasHeader) firstRow.apply(i) else "column-" + (i + 1), - attributeTypeList.apply(i) - ) - ) + Schema().add(firstRow.indices.map { i => + new Attribute( + if (hasHeader) firstRow(i) else s"column-${i + 1}", + attributeTypeList(i) ) - .build() + }) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala index 9ea25e13147..f4a3c427fc7 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/csvOld/CSVOldScanSourceOpDesc.scala @@ -84,18 +84,12 @@ class CSVOldScanSourceOpDesc extends ScanSourceOpDesc { reader.close() // build schema based on inferred AttributeTypes - Schema - .builder() - .add( - firstRow.indices - .map((i: Int) => - new Attribute( - if (hasHeader) firstRow.apply(i) else "column-" + (i + 1), - attributeTypeList.apply(i) - ) - ) + Schema().add(firstRow.indices.map { i => + new Attribute( + if (hasHeader) firstRow(i) else s"column-${i + 1}", + attributeTypeList(i) ) - .build() + }) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala 
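The three CSV scan variants above all pair header names (or generated column-i fallbacks) with inferred attribute types positionally. The common shape:

    val header = Array("id", "name")        // from the CSV header, or "column-i" fallbacks
    val types  = Array("integer", "string") // from type inference over sampled rows

    val csvSchema = header.indices.foldLeft(Schema()) { (schema, i) =>
      schema.add(header(i), types(i))
    }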
b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala index 9a9deee9bbc..f0d7eb0c789 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/json/JSONLScanSourceOpDesc.scala @@ -99,13 +99,9 @@ class JSONLScanSourceOpDesc extends ScanSourceOpDesc { result.toArray })) - Schema - .builder() - .add( - sortedFieldNames.indices - .map(i => new Attribute(sortedFieldNames(i), attributeTypes(i))) - ) - .build() + Schema().add(sortedFieldNames.indices.map { i => + new Attribute(sortedFieldNames(i), attributeTypes(i)) + }) } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala index bdb59fff827..597424a068b 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/scan/text/TextInputSourceOpDesc.scala @@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.source.scan.text import com.fasterxml.jackson.annotation.JsonProperty import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle} import edu.uci.ics.amber.core.executor.OpExecWithClassName -import edu.uci.ics.amber.core.tuple.{Attribute, Schema} +import edu.uci.ics.amber.core.tuple.Schema import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc} import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo} import edu.uci.ics.amber.operator.metadata.annotations.UIWidget @@ -39,10 +39,7 @@ class TextInputSourceOpDesc extends SourceOperatorDescriptor with TextSourceOpDe ) override def sourceSchema(): Schema = - Schema - .builder() - .add(new Attribute(attributeName, attributeType.getType)) - .build() + Schema().add(attributeName, attributeType.getType) override def operatorInfo: OperatorInfo = OperatorInfo( diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala index 77113ff4660..c688e8e8302 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/SQLSourceOpDesc.scala @@ -127,52 +127,55 @@ abstract class SQLSourceOpDesc extends SourceOperatorDescriptor { } updatePort() - val schemaBuilder = Schema.builder() try { + val attributes = scala.collection.mutable.ListBuffer[Attribute]() val connection = establishConn connection.setReadOnly(true) val databaseMetaData = connection.getMetaData val columns = databaseMetaData.getColumns(null, null, this.table, null) - while ({ - columns.next - }) { + while (columns.next()) { val columnName = columns.getString("COLUMN_NAME") val datatype = columns.getInt("DATA_TYPE") - datatype match { + + // Map JDBC data types to AttributeType + val attributeType = datatype match { case Types.TINYINT | // -6 Types.TINYINT Types.SMALLINT | // 5 Types.SMALLINT Types.INTEGER => // 4 Types.INTEGER - schemaBuilder.add(new Attribute(columnName, AttributeType.INTEGER)) + AttributeType.INTEGER case Types.FLOAT | // 6 Types.FLOAT Types.REAL | // 7 Types.REAL 
Types.DOUBLE | // 8 Types.DOUBLE Types.NUMERIC => // 3 Types.NUMERIC - schemaBuilder.add(new Attribute(columnName, AttributeType.DOUBLE)) + AttributeType.DOUBLE case Types.BIT | // -7 Types.BIT Types.BOOLEAN => // 16 Types.BOOLEAN - schemaBuilder.add(new Attribute(columnName, AttributeType.BOOLEAN)) - case Types.BINARY => //-2 Types.BINARY - schemaBuilder.add(new Attribute(columnName, AttributeType.BINARY)) - case Types.DATE | //91 Types.DATE - Types.TIME | //92 Types.TIME - Types.LONGVARCHAR | //-1 Types.LONGVARCHAR - Types.CHAR | //1 Types.CHAR - Types.VARCHAR | //12 Types.VARCHAR - Types.NULL | //0 Types.NULL - Types.OTHER => //1111 Types.OTHER - schemaBuilder.add(new Attribute(columnName, AttributeType.STRING)) - case Types.BIGINT => //-5 Types.BIGINT - schemaBuilder.add(new Attribute(columnName, AttributeType.LONG)) + AttributeType.BOOLEAN + case Types.BINARY => // -2 Types.BINARY + AttributeType.BINARY + case Types.DATE | // 91 Types.DATE + Types.TIME | // 92 Types.TIME + Types.LONGVARCHAR | // -1 Types.LONGVARCHAR + Types.CHAR | // 1 Types.CHAR + Types.VARCHAR | // 12 Types.VARCHAR + Types.NULL | // 0 Types.NULL + Types.OTHER => // 1111 Types.OTHER + AttributeType.STRING + case Types.BIGINT => // -5 Types.BIGINT + AttributeType.LONG case Types.TIMESTAMP => // 93 Types.TIMESTAMP - schemaBuilder.add(new Attribute(columnName, AttributeType.TIMESTAMP)) + AttributeType.TIMESTAMP case _ => throw new RuntimeException( this.getClass.getSimpleName + ": unknown data type: " + datatype ) } + + // Add the attribute to the list + attributes += new Attribute(columnName, attributeType) } connection.close() - schemaBuilder.build() + Schema(attributes.toList) } catch { case e @ (_: SQLException | _: ClassCastException) => throw new RuntimeException( diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala index 6f688ae8e68..ccccaa720a4 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/source/sql/asterixdb/AsterixDBSourceOpDesc.scala @@ -127,23 +127,22 @@ class AsterixDBSourceOpDesc extends SQLSourceOpDesc { updatePort() - val sb: Schema.Builder = Schema.builder() - - // query dataset's Datatype from Metadata.`Datatype` + // Query dataset's Datatype from Metadata.`Datatype` val datasetDataType = queryAsterixDB( host, port, - "SELECT DatatypeName FROM Metadata.`Dataset` ds where ds.`DatasetName`='" + table + "';", + s"SELECT DatatypeName FROM Metadata.`Dataset` ds where ds.`DatasetName`='$table';", format = "JSON" ).get.next().asInstanceOf[JSONObject].getString("DatatypeName") - // query field types from Metadata.`Datatype` + // Query field types from Metadata.`Datatype` val fields = fetchDataTypeFields(datasetDataType, "", host, port) - for (key <- fields.keys.toList.sorted) { - sb.add(new Attribute(key, attributeTypeFromAsterixDBType(fields(key)))) + // Collect attributes by sorting field names and mapping them to Attribute instances + val attributes = fields.keys.toList.sorted.map { key => + new Attribute(key, attributeTypeFromAsterixDBType(fields(key))) } - sb.build() + Schema(attributes) } private def attributeTypeFromAsterixDBType(inputType: String): AttributeType = diff --git 
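The SQLSourceOpDesc refactor above separates concerns: first map each JDBC type code to an attribute type, then materialize the schema once from the collected attributes (the AsterixDB subclass does the same with its own type names). The mapping, condensed, with string literals standing in for the AttributeType enum:

    import java.sql.Types

    def jdbcToAttributeType(jdbcType: Int): String =
      jdbcType match {
        case Types.TINYINT | Types.SMALLINT | Types.INTEGER          => "integer"
        case Types.FLOAT | Types.REAL | Types.DOUBLE | Types.NUMERIC => "double"
        case Types.BIT | Types.BOOLEAN                               => "boolean"
        case Types.BINARY                                            => "binary"
        case Types.BIGINT                                            => "long"
        case Types.TIMESTAMP                                         => "timestamp"
        case Types.DATE | Types.TIME | Types.LONGVARCHAR | Types.CHAR |
            Types.VARCHAR | Types.NULL | Types.OTHER                 => "string"
        case other =>
          throw new RuntimeException(s"unknown data type: $other")
      }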
a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala index fd38d176ae1..f5b3b529589 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/java/JavaUDFOpDesc.scala @@ -72,21 +72,22 @@ class JavaUDFOpDesc extends LogicalOp { val propagateSchema = (inputSchemas: Map[PortIdentity, Schema]) => { val inputSchema = inputSchemas(operatorInfo.inputPorts.head.id) - val outputSchemaBuilder = Schema.builder() - // keep the same schema from input - if (retainInputColumns) outputSchemaBuilder.add(inputSchema) - // for any javaUDFType, it can add custom output columns (attributes). - if (outputColumns != null) { - if (retainInputColumns) { // check if columns are duplicated + var outputSchema = if (retainInputColumns) inputSchema else Schema() + // For any javaUDFType, it can add custom output columns (attributes). + if (outputColumns != null) { + if (retainInputColumns) { + // Check if columns are duplicated for (column <- outputColumns) { if (inputSchema.containsAttribute(column.getName)) throw new RuntimeException("Column name " + column.getName + " already exists!") } } - outputSchemaBuilder.add(outputColumns).build() + // Add custom output columns + outputSchema = outputSchema.add(outputColumns) } - Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + + Map(operatorInfo.outputPorts.head.id -> outputSchema) } if (workers > 1) diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala index a4af16c5415..57f85a663c5 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/DualInputPortsPythonUDFOpDescV2.scala @@ -91,22 +91,24 @@ class DualInputPortsPythonUDFOpDescV2 extends LogicalOp { .withPropagateSchema( SchemaPropagationFunc(inputSchemas => { Preconditions.checkArgument(inputSchemas.size == 2) + val inputSchema = inputSchemas(operatorInfo.inputPorts(1).id) - val outputSchemaBuilder = Schema.builder() - // keep the same schema from input - if (retainInputColumns) outputSchemaBuilder.add(inputSchema) - // for any pythonUDFType, it can add custom output columns (attributes). - if (outputColumns != null) { - if (retainInputColumns) { // check if columns are duplicated + var outputSchema = if (retainInputColumns) inputSchema else Schema() + // For any pythonUDFType, add custom output columns (attributes). 
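The JavaUDFOpDesc hunk above (and the Python and R UDF variants that follow) share one schema rule: start from the input schema when retainInputColumns is set, otherwise from an empty one, and reject user-defined columns that collide with retained ones. As a helper over the toy types:

    def udfOutputSchema(input: Schema, retain: Boolean, extra: List[Attribute]): Schema = {
      val base = if (retain) input else Schema()
      extra.foreach { col =>
        if (retain && input.containsAttribute(col.name))
          throw new RuntimeException(s"Column name ${col.name} already exists!")
      }
      extra.foldLeft(base)((schema, col) => schema.add(col))
    }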
+ if (outputColumns != null) { + if (retainInputColumns) { + // Check if columns are duplicated for (column <- outputColumns) { if (inputSchema.containsAttribute(column.getName)) - throw new RuntimeException("Column name " + column.getName + " already exists!") + throw new RuntimeException(s"Column name ${column.getName} already exists!") } } - outputSchemaBuilder.add(outputColumns).build() + // Add custom output columns + outputSchema = outputSchema.add(outputColumns) } - Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + + Map(operatorInfo.outputPorts.head.id -> outputSchema) }) ) } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala index 056326c8093..ff49ee70ff2 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonLambdaFunctionOpDesc.scala @@ -16,31 +16,33 @@ class PythonLambdaFunctionOpDesc extends PythonOperatorDescriptor { ): Map[PortIdentity, Schema] = { Preconditions.checkArgument(inputSchemas.size == 1) Preconditions.checkArgument(lambdaAttributeUnits.nonEmpty) + val inputSchema = inputSchemas.values.head - val outputSchemaBuilder = Schema.builder() - // keep the same schema from input - outputSchemaBuilder.add(inputSchema) - // add new attributes + var outputSchema = inputSchema + + // Add new attributes for (unit <- lambdaAttributeUnits) { if (unit.attributeName.equalsIgnoreCase("Add New Column")) { - if (inputSchema.containsAttribute(unit.newAttributeName)) { + if (outputSchema.containsAttribute(unit.newAttributeName)) { throw new RuntimeException( - "Column name " + unit.newAttributeName + " already exists!" + s"Column name ${unit.newAttributeName} already exists!" 
) } - if (unit.newAttributeName != null && unit.newAttributeName.nonEmpty) - outputSchemaBuilder.add(unit.newAttributeName, unit.attributeType) + if (unit.newAttributeName != null && unit.newAttributeName.nonEmpty) { + outputSchema = outputSchema.add(unit.newAttributeName, unit.attributeType) + } } } - var outputSchema = outputSchemaBuilder.build() - // type casting + + // Type casting for (unit <- lambdaAttributeUnits) { - if (!unit.attributeName.equalsIgnoreCase("Add New Column")) + if (!unit.attributeName.equalsIgnoreCase("Add New Column")) { outputSchema = AttributeTypeUtils.SchemaCasting(outputSchema, unit.attributeName, unit.attributeType) + } } - Map(operatorInfo.outputPorts.head.id -> outputSchema) + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala index 0f6d1988de5..2f636a42421 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonTableReducerOpDesc.scala @@ -15,11 +15,11 @@ class PythonTableReducerOpDesc extends PythonOperatorDescriptor { inputSchemas: Map[PortIdentity, Schema] ): Map[PortIdentity, Schema] = { Preconditions.checkArgument(lambdaAttributeUnits.nonEmpty) - val outputSchemaBuilder = Schema.builder() - for (unit <- lambdaAttributeUnits) { - outputSchemaBuilder.add(unit.attributeName, unit.attributeType) + val outputSchema = lambdaAttributeUnits.foldLeft(Schema()) { (schema, unit) => + schema.add(unit.attributeName, unit.attributeType) } - Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + + Map(operatorInfo.outputPorts.head.id -> outputSchema) } override def operatorInfo: OperatorInfo = diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala index 216284315cb..1c070636ebd 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala @@ -73,23 +73,24 @@ class PythonUDFOpDescV2 extends LogicalOp { } val propagateSchema = (inputSchemas: Map[PortIdentity, Schema]) => { - // Preconditions.checkArgument(schemas.length == 1) val inputSchema = inputSchemas(operatorInfo.inputPorts.head.id) - val outputSchemaBuilder = Schema.builder() - // keep the same schema from input - if (retainInputColumns) outputSchemaBuilder.add(inputSchema) - // for any pythonUDFType, it can add custom output columns (attributes). 
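The PythonLambdaFunctionOpDesc hunk above works in two passes: first append the genuinely new columns, then re-type existing ones through AttributeTypeUtils.SchemaCasting. A toy cast makes the ordering visible (cast below is a simplified stand-in, not the real SchemaCasting):

    def cast(schema: Schema, name: String, newType: String): Schema =
      Schema(schema.attributes.map(a =>
        if (a.name == name) a.copy(attrType = newType) else a))

    val added   = Schema().add("text", "string").add("score", "string") // pass 1: add
    val reTyped = cast(added, "score", "integer")                       // pass 2: re-type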
- if (outputColumns != null) { - if (retainInputColumns) { // check if columns are duplicated + var outputSchema = if (retainInputColumns) inputSchema else Schema() + // Add custom output columns if defined + if (outputColumns != null) { + if (retainInputColumns) { + // Check for duplicate column names for (column <- outputColumns) { - if (inputSchema.containsAttribute(column.getName)) - throw new RuntimeException("Column name " + column.getName + " already exists!") + if (inputSchema.containsAttribute(column.getName)) { + throw new RuntimeException(s"Column name ${column.getName} already exists!") + } } } - outputSchemaBuilder.add(outputColumns).build() + // Add output columns to the schema + outputSchema = outputSchema.add(outputColumns) } - Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build()) + + Map(operatorInfo.outputPorts.head.id -> outputSchema) } if (workers > 1) { diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala index a219ba2808a..ca45e0408d0 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/source/PythonUDFSourceOpDescV2.scala @@ -72,10 +72,10 @@ class PythonUDFSourceOpDescV2 extends SourceOperatorDescriptor { } override def sourceSchema(): Schema = { - val outputSchemaBuilder = Schema.builder() - if (columns.nonEmpty && columns != null) { - outputSchemaBuilder.add(columns) + if (columns != null && columns.nonEmpty) { + Schema().add(columns) + } else { + Schema() } - outputSchemaBuilder.build() } } diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala index bc9d6ec1b5e..5e0815f506a 100644 --- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala +++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala @@ -73,21 +73,23 @@ class RUDFOpDesc extends LogicalOp { val propagateSchema = (inputSchemas: Map[PortIdentity, Schema]) => { val inputSchema = inputSchemas(operatorInfo.inputPorts.head.id) - val outputSchemaBuilder = Schema.builder() - // keep the same schema from input - if (retainInputColumns) outputSchemaBuilder.add(inputSchema) - // for any javaUDFType, it can add custom output columns (attributes). 
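The PythonUDFSourceOpDescV2 hunk above also fixes an operand-order bug: the old columns.nonEmpty && columns != null dereferences columns before the null guard, since short-circuit && only protects its right operand. Illustrated:

    val columns: List[Attribute] = null // a user-provided list may be absent

    // columns.nonEmpty && columns != null  -> NullPointerException
    // columns != null && columns.nonEmpty  -> short-circuits to false
    val schema = if (columns != null && columns.nonEmpty) Schema(columns) else Schema()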
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
index bc9d6ec1b5e..5e0815f506a 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFOpDesc.scala
@@ -73,21 +73,23 @@ class RUDFOpDesc extends LogicalOp {

     val propagateSchema = (inputSchemas: Map[PortIdentity, Schema]) => {
       val inputSchema = inputSchemas(operatorInfo.inputPorts.head.id)
-      val outputSchemaBuilder = Schema.builder()
-      // keep the same schema from input
-      if (retainInputColumns) outputSchemaBuilder.add(inputSchema)
-      // for any javaUDFType, it can add custom output columns (attributes).
-      if (outputColumns != null) {
-        if (retainInputColumns) { // check if columns are duplicated
+      var outputSchema = if (retainInputColumns) inputSchema else Schema()
+      // Add custom output columns if provided
+      if (outputColumns != null) {
+        if (retainInputColumns) {
+          // Check for duplicate column names
           for (column <- outputColumns) {
-            if (inputSchema.containsAttribute(column.getName))
-              throw new RuntimeException("Column name " + column.getName + " already exists!")
+            if (inputSchema.containsAttribute(column.getName)) {
+              throw new RuntimeException(s"Column name ${column.getName} already exists!")
+            }
           }
         }
-        outputSchemaBuilder.add(outputColumns).build()
+        // Add output columns to the schema
+        outputSchema = outputSchema.add(outputColumns)
       }
-      Map(operatorInfo.outputPorts.head.id -> outputSchemaBuilder.build())
+
+      Map(operatorInfo.outputPorts.head.id -> outputSchema)
     }

     val r_operator_type = if (useTupleAPI) "r-tuple" else "r-table"
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
index 19f65d42c0d..e84900f8ddf 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/r/RUDFSourceOpDesc.scala
@@ -86,10 +86,10 @@ class RUDFSourceOpDesc extends SourceOperatorDescriptor {
   }

   override def sourceSchema(): Schema = {
-    val outputSchemaBuilder = Schema.builder()
-    if (columns.nonEmpty && columns != null) {
-      outputSchemaBuilder.add(columns)
+    if (columns != null && columns.nonEmpty) {
+      Schema().add(columns)
+    } else {
+      Schema()
     }
-    outputSchemaBuilder.build()
   }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala
index 2eb0fefa152..6db50ce069e 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.unneststring

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import edu.uci.ics.amber.core.executor.OpExecWithClassName
-import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.AttributeType
 import edu.uci.ics.amber.core.workflow.{PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.flatmap.FlatMapOpDesc
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -53,14 +53,10 @@ class UnnestStringOpDesc extends FlatMapOpDesc {
       .withOutputPorts(operatorInfo.outputPorts)
       .withPropagateSchema(
         SchemaPropagationFunc(inputSchemas => {
-          val outputSchema =
-            if (resultAttribute == null || resultAttribute.trim.isEmpty) null
-            else
-              Schema
-                .builder()
-                .add(inputSchemas.values.head)
-                .add(resultAttribute, AttributeType.STRING)
-                .build()
+          val outputSchema = Option(resultAttribute)
+            .filter(_.trim.nonEmpty)
+            .map(attr => inputSchemas.values.head.add(attr, AttributeType.STRING))
+            .getOrElse(throw new RuntimeException("Result attribute cannot be empty"))
           Map(operatorInfo.outputPorts.head.id -> outputSchema)
         })
       )
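The UnnestString change is the one behavioral edit in this group: a blank result attribute used to propagate a null schema downstream, whereas the Option pipeline now fails fast with an exception. A self-contained sketch of the pattern, with a String standing in for Schema (the demo object and the concatenation-based "add" are illustrative assumptions):

    object ResultAttributeDemo extends App {
      // String stands in for Schema; schema.add(...) is simulated by concatenation.
      def outputSchema(resultAttribute: String, inputSchema: String): String =
        Option(resultAttribute)
          .filter(_.trim.nonEmpty) // rejects null, "", and all-whitespace values
          .map(attr => s"$inputSchema + $attr:STRING")
          .getOrElse(throw new RuntimeException("Result attribute cannot be empty"))

      println(outputSchema("token", "inputSchema")) // inputSchema + token:STRING
      // outputSchema(null, "inputSchema") now throws instead of yielding a null schema.
    }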
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala
index ff082be7b3d..6a41a1d3e72 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/DotPlot/DotPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.DotPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
@@ -20,10 +20,9 @@ class DotPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala
index 16e682b4163..2928d379c94 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/IcicleChart/IcicleChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.IcicleChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -37,10 +37,9 @@ class IcicleChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

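Every visualization operator in the remaining diffs receives this same mechanical rewrite: drop the Attribute import, replace the four-line builder with a single immutable expression producing the html-content schema. A before/after sketch of just that shape, reusing the illustrative MiniSchema stand-in from the earlier example (not the real Texera API):

    // Old shape: Schema.builder().add(new Attribute("html-content", STRING)).build()
    // New shape: one expression on an immutable schema.
    final case class MiniSchema(attributes: Vector[(String, String)] = Vector.empty) {
      def add(name: String, attrType: String): MiniSchema =
        copy(attributes = attributes :+ (name -> attrType))
    }

    object HtmlContentSchemaDemo extends App {
      val outputSchema = MiniSchema().add("html-content", "STRING")
      println(outputSchema.attributes) // Vector((html-content,STRING))
    }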
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala
index 5e85d1979b2..ccbf5f8f352 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ImageViz/ImageVisualizerOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.ImageViz

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -19,10 +19,9 @@ class ImageVisualizerOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala
index 4b6a366d7c4..19a113a7c0a 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ScatterMatrixChart/ScatterMatrixChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.ScatterMatrixChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.{
@@ -37,10 +37,9 @@ class ScatterMatrixChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala
index c3924b3275d..258cc8b03dd 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/barChart/BarChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.barChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
@@ -53,10 +53,9 @@ class BarChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala
index 5df97e865a1..ca55d3b7bd5 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/boxPlot/BoxPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.boxPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -40,10 +40,9 @@ class BoxPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala
index 3a4db9d8e91..fb0dd91e6c0 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/bubbleChart/BubbleChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.bubbleChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
@@ -46,10 +46,9 @@ class BubbleChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala
index 80ee1ff31e1..15e620d8a97 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/candlestickChart/CandlestickChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.candlestickChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -44,10 +44,9 @@ class CandlestickChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala
index 78d818cc161..ac74425eb62 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/continuousErrorBands/ContinuousErrorBandsOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.continuousErrorBands

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
@@ -28,10 +28,9 @@ class ContinuousErrorBandsOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala
index 0a132c2c996..721da590564 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/contourPlot/ContourPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.contourPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -49,10 +49,9 @@ class ContourPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala
index bac0482bf8a..433f578fd6e 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/dumbbellPlot/DumbbellPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.dumbbellPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -62,10 +62,9 @@ class DumbbellPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala
index 32c250b55dd..0f2fe00eeed 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/figureFactoryTable/FigureFactoryTableOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.figureFactoryTable

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
@@ -107,10 +107,9 @@ class FigureFactoryTableOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala
index 2e4e0691a08..fa7bc133aeb 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/filledAreaPlot/FilledAreaPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.filledAreaPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -49,10 +49,9 @@ class FilledAreaPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala
index a7e8075edff..3a82e95e702 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/funnelPlot/FunnelPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.funnelPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -38,10 +38,9 @@ class FunnelPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala
index 382035b3d64..0cafc4e2143 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ganttChart/GanttChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.ganttChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -56,10 +56,9 @@ class GanttChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala
index 3b623fbccc3..29f1dd8af23 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/heatMap/HeatMapOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.heatMap

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -31,10 +31,9 @@ class HeatMapOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala
index 3e09d51484c..20c06c3a677 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/hierarchychart/HierarchyChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.hierarchychart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
@@ -40,10 +40,9 @@ class HierarchyChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala
index 829f5355224..d63944e319e 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/histogram/HistogramChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.histogram

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -97,10 +97,9 @@ class HistogramChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala
index 5d84d7e548b..1abb6fa8a28 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/htmlviz/HtmlVizOpDesc.scala
@@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.visualization.htmlviz
 import com.fasterxml.jackson.annotation.JsonProperty
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
 import edu.uci.ics.amber.core.executor.OpExecWithClassName
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.LogicalOp
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -39,10 +39,7 @@ class HtmlVizOpDesc extends LogicalOp {
       .withOutputPorts(operatorInfo.outputPorts)
       .withPropagateSchema(
         SchemaPropagationFunc(inputSchemas => {
-          val outputSchema = Schema
-            .builder()
-            .add(new Attribute("html-content", AttributeType.STRING))
-            .build()
+          val outputSchema = Schema().add("html-content", AttributeType.STRING)
           Map(operatorInfo.outputPorts.head.id -> outputSchema)
         })
       )
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala
index 69eb7f83d12..37f8780fce9 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/lineChart/LineChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.lineChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
@@ -29,10 +29,9 @@ class LineChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala
index 5ff0bfa88ae..400ce53f1e6 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/pieChart/PieChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.pieChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
@@ -36,10 +36,9 @@ class PieChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala
index 58f8759594b..a6e52ac5917 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/quiverPlot/QuiverPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.quiverPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
@@ -45,10 +45,9 @@ class QuiverPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala
index ca8cff0cdcb..d468c64b6c8 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/sankeyDiagram/SankeyDiagramOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.sankeyDiagram

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -32,10 +32,9 @@ class SankeyDiagramOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala
index 2a42c2c84dc..5ce75d46e74 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatter3DChart/Scatter3dChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.scatter3DChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -37,10 +37,9 @@ class Scatter3dChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala
index d56a2c45d1d..1d6fa4a4068 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/scatterplot/ScatterplotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.scatterplot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -63,10 +63,9 @@ class ScatterplotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala
index 87d174d01e9..a8a30c2e7b9 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/tablesChart/TablesPlotOpDesc.scala
@@ -1,7 +1,7 @@
 package edu.uci.ics.amber.operator.visualization.tablesChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
@@ -83,10 +83,9 @@ class TablesPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }
 }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala
index 2840ea421da..8d6d9483d25 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/ternaryPlot/TernaryPlotOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.ternaryPlot

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
@@ -60,10 +60,9 @@ class TernaryPlotOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala
index 90482deaaa5..f89154a3810 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/urlviz/UrlVizOpDesc.scala
@@ -3,7 +3,7 @@ package edu.uci.ics.amber.operator.visualization.urlviz
 import com.fasterxml.jackson.annotation.JsonProperty
 import com.kjetland.jackson.jsonSchema.annotations.{JsonSchemaInject, JsonSchemaTitle}
 import edu.uci.ics.amber.core.executor.OpExecWithClassName
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PhysicalOp, SchemaPropagationFunc}
 import edu.uci.ics.amber.operator.LogicalOp
 import edu.uci.ics.amber.core.virtualidentity.{ExecutionIdentity, WorkflowIdentity}
@@ -50,10 +50,7 @@ class UrlVizOpDesc extends LogicalOp {
       .withOutputPorts(operatorInfo.outputPorts)
       .withPropagateSchema(
         SchemaPropagationFunc(_ => {
-          val outputSchema = Schema
-            .builder()
-            .add(new Attribute("html-content", AttributeType.STRING))
-            .build()
+          val outputSchema = Schema().add("html-content", AttributeType.STRING)
           Map(operatorInfo.outputPorts.head.id -> outputSchema)
         })
       )
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala
index 2ba19165765..8aa21b9e43a 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/waterfallChart/WaterfallChartOpDesc.scala
@@ -2,7 +2,7 @@ package edu.uci.ics.amber.operator.visualization.waterfallChart

 import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
 import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
@@ -26,10 +26,9 @@ class WaterfallChartOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
index e6e2c408e48..d69013040f4 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/visualization/wordCloud/WordCloudOpDesc.scala
@@ -6,13 +6,13 @@ import com.kjetland.jackson.jsonSchema.annotations.{
   JsonSchemaInt,
   JsonSchemaTitle
 }
-import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema}
+import edu.uci.ics.amber.core.tuple.{AttributeType, Schema}
+import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
+import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}
 import edu.uci.ics.amber.operator.PythonOperatorDescriptor
 import edu.uci.ics.amber.operator.metadata.annotations.AutofillAttributeName
 import edu.uci.ics.amber.operator.metadata.{OperatorGroupConstants, OperatorInfo}
 import edu.uci.ics.amber.operator.visualization.ImageUtility
-import edu.uci.ics.amber.core.workflow.OutputPort.OutputMode
-import edu.uci.ics.amber.core.workflow.{InputPort, OutputPort, PortIdentity}

 class WordCloudOpDesc extends PythonOperatorDescriptor {
   @JsonProperty(required = true)
   @JsonSchemaTitle("Text column")
@@ -27,10 +27,9 @@ class WordCloudOpDesc extends PythonOperatorDescriptor {
   override def getOutputSchemas(
       inputSchemas: Map[PortIdentity, Schema]
   ): Map[PortIdentity, Schema] = {
-    val outputSchema = Schema
-      .builder()
-      .add(new Attribute("html-content", AttributeType.STRING))
-      .build()
+    val outputSchema = Schema()
+      .add("html-content", AttributeType.STRING)
+
     Map(operatorInfo.outputPorts.head.id -> outputSchema)
   }

diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala
index 60725d53295..29d79438132 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/cartesianProduct/CartesianProductOpExecSpec.scala
@@ -38,7 +38,7 @@ class CartesianProductOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
       .map(num =>
         new Attribute(base_name + (if (append_num) "#@" + num else ""), AttributeType.STRING)
       )
-    Schema.builder().add(attrs).build()
+    Schema().add(attrs)
   }

   before {
@@ -93,16 +93,10 @@ class CartesianProductOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
     val numRightTuples: Int = 3

     val duplicateAttribute: Attribute = new Attribute("left", AttributeType.STRING)
-    val leftSchema = Schema
-      .builder()
-      .add(generate_schema("left", numLeftSchemaAttributes - 1))
+    val leftSchema = generate_schema("left", numLeftSchemaAttributes - 1)
       .add(duplicateAttribute)
-      .build()
-    val rightSchema = Schema
-      .builder()
-      .add(generate_schema("right", numRightSchemaAttributes - 1))
+    val rightSchema = generate_schema("right", numRightSchemaAttributes - 1)
       .add(duplicateAttribute)
-      .build()

     val inputSchemas = Map(PortIdentity() -> leftSchema, PortIdentity(1) -> rightSchema)
     val outputSchema = opDesc.getExternalOutputSchemas(inputSchemas).values.head
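The test specs below exercise the two add overloads the immutable API exposes: appending a single attribute and merging an entire schema (as in generate_schema(...).add(duplicateAttribute) above). A self-contained sketch of that composition style, using an illustrative stand-in rather than the real Schema and Attribute types:

    // Stand-in with the two add() overloads the specs rely on. MiniSchema,
    // its String attributes, and the demo object are illustrative assumptions.
    final case class MiniSchema(attributes: Vector[String] = Vector.empty) {
      def add(name: String): MiniSchema = copy(attributes = attributes :+ name)
      def add(other: MiniSchema): MiniSchema = copy(attributes = attributes ++ other.attributes)
    }

    object SpecCompositionDemo extends App {
      val generated = MiniSchema().add("left").add("left#@1") // like generate_schema(...)
      val leftSchema = generated.add(MiniSchema().add("left")) // merge a whole schema
      println(leftSchema.attributes) // Vector(left, left#@1, left): duplicate kept here,
      // whereas the real operator is expected to detect the clash downstream.
    }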
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala
index 1d19700e071..2a4d6745523 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/dictionary/DictionaryMatcherOpExecSpec.scala
@@ -7,12 +7,10 @@ import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec

 class DictionaryMatcherOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
     .add(new Attribute("field3", AttributeType.BOOLEAN))
-    .build()

   val tuple: Tuple = Tuple
     .builder(tupleSchema)
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/difference/DifferenceOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/difference/DifferenceOpExecSpec.scala
index 37aa0e35004..6f1813c7319 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/difference/DifferenceOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/difference/DifferenceOpExecSpec.scala
@@ -8,14 +8,10 @@ class DifferenceOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   var input2: Int = 1
   var opExec: DifferenceOpExec = _
   var counter: Int = 0
-  val schema: Schema = Schema
-    .builder()
-    .add(
-      new Attribute("field1", AttributeType.STRING),
-      new Attribute("field2", AttributeType.INTEGER),
-      new Attribute("field3", AttributeType.BOOLEAN)
-    )
-    .build()
+  val schema: Schema = Schema()
+    .add(new Attribute("field1", AttributeType.STRING))
+    .add(new Attribute("field2", AttributeType.INTEGER))
+    .add(new Attribute("field3", AttributeType.BOOLEAN))

   def tuple(): Tuple = {
     counter += 1
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/distinct/DistinctOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/distinct/DistinctOpExecSpec.scala
index 8f397a6dc49..865693f274e 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/distinct/DistinctOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/distinct/DistinctOpExecSpec.scala
@@ -4,12 +4,10 @@ import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType, Schema, Tuple, TupleLike}

 class DistinctOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
     .add(new Attribute("field3", AttributeType.BOOLEAN))
-    .build()

   val tuple: () => Tuple = () =>
     Tuple
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala
index a17642c8286..47dc26e1cc7 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/filter/SpecializedFilterOpExecSpec.scala
@@ -14,19 +14,17 @@ class SpecializedFilterOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
       .map(attributeType =>
         Tuple
           .builder(
-            Schema.builder().add(new Attribute(attributeType.name(), attributeType)).build()
+            Schema().add(new Attribute(attributeType.name(), attributeType))
           )
           .add(new Attribute(attributeType.name(), attributeType), null)
           .build()
       )

-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("string", AttributeType.STRING))
     .add(new Attribute("int", AttributeType.INTEGER))
     .add(new Attribute("bool", AttributeType.BOOLEAN))
     .add(new Attribute("long", AttributeType.LONG))
-    .build()

   val allNullTuple: Tuple = Tuple
     .builder(tupleSchema)
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala
index c3cd8f6ecd3..c31d5eb25ab 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/hashJoin/HashJoinOpSpec.scala
@@ -23,11 +23,10 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter {
   var opDesc: HashJoinOpDesc[String] = _

   def getInternalHashTableSchema(buildInputSchema: Schema): Schema = {
-    Schema
-      .builder()
+    Schema()
       .add(HASH_JOIN_INTERNAL_KEY_NAME, AttributeType.ANY)
       .add(buildInputSchema)
-      .build()
+
   }

   def tuple(name: String, n: Int = 1, i: Option[Int]): Tuple = {
@@ -39,13 +38,10 @@ class HashJoinOpSpec extends AnyFlatSpec with BeforeAndAfter {
   }

   def schema(name: String, n: Int = 1): Schema = {
-    Schema
-      .builder()
-      .add(
-        new Attribute(name, AttributeType.STRING),
-        new Attribute(name + "_" + n, AttributeType.STRING)
-      )
-      .build()
+    Schema()
+      .add(new Attribute(name, AttributeType.STRING))
+      .add(new Attribute(name + "_" + n, AttributeType.STRING))
+
   }

   it should "work with basic two input streams with different buildAttributeName and probeAttributeName" in {
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intersect/IntersectOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intersect/IntersectOpExecSpec.scala
index 03c10c310b6..2310b8c5ccf 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intersect/IntersectOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/intersect/IntersectOpExecSpec.scala
@@ -11,14 +11,10 @@ class IntersectOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   var opExec: IntersectOpExec = _
   var counter: Int = 0

-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
-    .add(
-      new Attribute("field3", AttributeType.BOOLEAN)
-    )
-    .build()
+    .add(new Attribute("field3", AttributeType.BOOLEAN))

   def physicalOpId(): PhysicalOpIdentity = {
     counter += 1
-    Schema
-      .builder()
-      .add(
-        new Attribute(name, attributeType),
-        new Attribute(name + "_" + n, attributeType)
-      )
-      .build()
+    Schema()
+      .add(new Attribute(name, attributeType))
+      .add(new Attribute(name + "_" + n, attributeType))
+
   }

   def longTuple(name: String, n: Int = 1, i: Long): Tuple = {
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala
index c0e12804e7c..60bc0471532 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/keywordSearch/KeywordSearchOpExecSpec.scala
@@ -9,10 +9,8 @@ class KeywordSearchOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   val inputPort: Int = 0
   val opDesc: KeywordSearchOpDesc = new KeywordSearchOpDesc()

-  val schema: Schema = Schema
-    .builder()
+  val schema: Schema = Schema()
     .add(new Attribute("text", AttributeType.STRING))
-    .build()

   def createTuple(text: String): Tuple = {
     Tuple
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala
index edd889734d7..bdd5a58f94d 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/projection/ProjectionOpExecSpec.scala
@@ -5,12 +5,10 @@ import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
     .add(new Attribute("field3", AttributeType.BOOLEAN))
-    .build()

   val tuple: Tuple = Tuple
     .builder(tupleSchema)
@@ -38,11 +36,9 @@ class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
       new AttributeUnit("field2", "f2"),
       new AttributeUnit("field1", "f1")
     )
-    val outputSchema = Schema
-      .builder()
+    val outputSchema = Schema()
       .add(new Attribute("f1", AttributeType.STRING))
       .add(new Attribute("f2", AttributeType.INTEGER))
-      .build()

     val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc))
     projectionOpExec.open()
@@ -64,11 +60,9 @@ class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
       new AttributeUnit("field3", "f3"),
       new AttributeUnit("field1", "f1")
     )
-    val outputSchema = Schema
-      .builder()
+    val outputSchema = Schema()
       .add(new Attribute("f3", AttributeType.BOOLEAN))
       .add(new Attribute("f1", AttributeType.STRING))
-      .build()

     val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc))
     projectionOpExec.open()
@@ -121,11 +115,9 @@ class ProjectionOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
       new AttributeUnit("field2", "f2"),
       new AttributeUnit("field1", "")
     )
-    val outputSchema = Schema
-      .builder()
+    val outputSchema = Schema()
       .add(new Attribute("field1", AttributeType.STRING))
       .add(new Attribute("f2", AttributeType.INTEGER))
-      .build()

     val projectionOpExec = new ProjectionOpExec(objectMapper.writeValueAsString(opDesc))
     projectionOpExec.open()
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala
index aeab7443c1d..6a7966d9e0c 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/sortPartitions/SortPartitionsOpExecSpec.scala
@@ -5,12 +5,10 @@ import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 class SortPartitionsOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
     .add(new Attribute("field3", AttributeType.BOOLEAN))
-    .build()

   val tuple: Int => Tuple = i =>
     Tuple
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpExecSpec.scala
index d53e857285f..f0e45a713de 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/symmetricDifference/SymmetricDifferenceOpExecSpec.scala
@@ -13,14 +13,10 @@ import edu.uci.ics.amber.core.tuple.{
 class SymmetricDifferenceOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
   var opExec: SymmetricDifferenceOpExec = _
   var counter: Int = 0
-  val schema: Schema = Schema
-    .builder()
-    .add(
-      new Attribute("field1", AttributeType.STRING),
-      new Attribute("field2", AttributeType.INTEGER),
-      new Attribute("field3", AttributeType.BOOLEAN)
-    )
-    .build()
+  val schema: Schema = Schema()
+    .add(new Attribute("field1", AttributeType.STRING))
+    .add(new Attribute("field2", AttributeType.INTEGER))
+    .add(new Attribute("field3", AttributeType.BOOLEAN))

   def tuple(): Tuple = {
     counter += 1
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala
index 39b3b5e8d60..83487206e06 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/typecasting/TypeCastingOpExecSpec.scala
@@ -5,21 +5,18 @@ import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 class TypeCastingOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
     .add(new Attribute("field3", AttributeType.BOOLEAN))
     .add(new Attribute("field4", AttributeType.LONG))
-    .build()

-  val castToSchema: Schema = Schema
-    .builder()
+  val castToSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.STRING))
     .add(new Attribute("field3", AttributeType.STRING))
     .add(new Attribute("field4", AttributeType.LONG))
-    .build()
+
   val castingUnit1 = new TypeCastingUnit()
   castingUnit1.attribute = "field2"
   castingUnit1.resultType = AttributeType.STRING
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala
index d63b82900bb..29905bbccfc 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/operator/unneststring/UnnestStringOpExecSpec.scala
@@ -6,12 +6,10 @@ import edu.uci.ics.amber.util.JSONUtils.objectMapper
 import org.scalatest.BeforeAndAfter
 import org.scalatest.flatspec.AnyFlatSpec
 class UnnestStringOpExecSpec extends AnyFlatSpec with BeforeAndAfter {
-  val tupleSchema: Schema = Schema
-    .builder()
+  val tupleSchema: Schema = Schema()
     .add(new Attribute("field1", AttributeType.STRING))
     .add(new Attribute("field2", AttributeType.INTEGER))
     .add(new Attribute("field3", AttributeType.STRING))
-    .build()

   val tuple: Tuple = Tuple
     .builder(tupleSchema)
diff --git a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/util/ArrowUtilsSpec.scala b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/util/ArrowUtilsSpec.scala
index 02367732418..bc20e11ab86 100644
--- a/core/workflow-operator/src/test/scala/edu/uci/ics/amber/util/ArrowUtilsSpec.scala
+++ b/core/workflow-operator/src/test/scala/edu/uci/ics/amber/util/ArrowUtilsSpec.scala
@@ -25,15 +25,13 @@ class ArrowUtilsSpec extends AnyFlatSpec {
   val timestamp = new ArrowType.Timestamp(TimeUnit.MILLISECOND, "UTC")
   val string: ArrowType.Utf8 = ArrowType.Utf8.INSTANCE

-  val texeraSchema: Schema = Schema
-    .builder()
+  val texeraSchema: Schema = Schema()
     .add("test-1", AttributeType.INTEGER)
     .add("test-2", AttributeType.LONG)
     .add("test-3", AttributeType.BOOLEAN)
     .add("test-4", AttributeType.DOUBLE)
     .add("test-5", AttributeType.TIMESTAMP)
     .add("test-6", AttributeType.STRING)
-    .build()

   val arrowSchema: org.apache.arrow.vector.types.pojo.Schema =
     new org.apache.arrow.vector.types.pojo.Schema(
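
All of the hunks in this patch apply the same mechanical rewrite: the mutable `Schema.builder()...build()` chain becomes a `Schema()` constructor whose `add` returns a new schema, so the terminal `.build()` disappears and composition such as `.add(buildInputSchema)` stays chainable. For reference, here is a minimal sketch of the shape of such a fluent immutable API; the `FluentSchema` name and the List-backed implementation are illustrative assumptions, not Texera's actual Schema internals:

    // Illustrative sketch only: an immutable, fluent schema where every
    // add returns a fresh copy, so no terminal build() step is required.
    object FluentSchemaSketch {
      import edu.uci.ics.amber.core.tuple.{Attribute, AttributeType}

      case class FluentSchema(attributes: List[Attribute] = Nil) {
        def add(attribute: Attribute): FluentSchema =
          copy(attributes = attributes :+ attribute)
        def add(name: String, attributeType: AttributeType): FluentSchema =
          add(new Attribute(name, attributeType))
        def add(other: FluentSchema): FluentSchema =
          copy(attributes = attributes ++ other.attributes)
      }

      // Call sites then read exactly like the updated tests:
      val tupleSchema: FluentSchema = FluentSchema()
        .add(new Attribute("field1", AttributeType.STRING))
        .add(new Attribute("field2", AttributeType.INTEGER))
        .add(new Attribute("field3", AttributeType.BOOLEAN))
    }

One reviewing note on this design: with an immutable schema, every `.add` allocates a new instance. That is negligible for the fixed schemas in these tests, but worth keeping in mind on hot paths.
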
From f2aeb0a189922cc990040b24947ffb3efa85a511 Mon Sep 17 00:00:00 2001
From: Yicong Huang <17627829+Yicong-Huang@users.noreply.github.com>
Date: Wed, 1 Jan 2025 13:46:21 -0800
Subject: [PATCH 09/10] Fix Python UDF source detection (#3189)

PhysicalOp relies on the number of input ports to determine whether an
operator is a source operator: an operator with no input ports is treated
as a source. For Python UDFs, after the changes in #3183, the input ports
were no longer correctly associated with the PhysicalOp, causing all
Python UDFs to be recognized as source operators. This PR fixes the issue.
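
The underlying bug is a Scala parsing subtlety visible in the diff below: a method chain written immediately after an if/else binds only to the last branch, i.e. `if (c) { a } else { b }.f` parses as `if (c) { a } else ({ b }.f)`, so the port wiring ran on only one branch. A self-contained sketch of the pitfall and of the fix shape used in this patch; the `Builder` type is a hypothetical stand-in, not Texera's PhysicalOp API:

    object ChainingPitfall extends App {
      // Hypothetical stand-in for a fluent builder such as PhysicalOp.
      case class Builder(steps: List[String] = Nil) {
        def withStep(s: String): Builder = copy(steps = steps :+ s)
      }

      val workers = 2

      // Buggy shape: `.withStep("ports")` attaches to the else branch only,
      // mirroring how the port assignments were skipped on one branch.
      val buggy =
        if (workers > 1) {
          Builder().withStep("parallel")
        } else {
          Builder().withStep("single")
        }.withStep("ports")

      // Fixed shape, as in this patch: name the if/else result first, then
      // chain the shared configuration onto that name.
      val physicalOp =
        if (workers > 1) Builder().withStep("parallel")
        else Builder().withStep("single")
      val fixed = physicalOp.withStep("ports")

      println(buggy.steps) // List(parallel): "ports" is silently missing
      println(fixed.steps) // List(parallel, ports): applied on either branch
    }
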
---
 .../architecture/pythonworker/PythonProxyClient.scala | 2 +-
 .../ics/amber/operator/udf/python/PythonUDFOpDescV2.scala | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala
index c7dc6400c1e..61a1a2641d9 100644
--- a/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala
+++ b/core/amber/src/main/scala/edu/uci/ics/amber/engine/architecture/pythonworker/PythonProxyClient.scala
@@ -69,7 +69,7 @@ class PythonProxyClient(portNumberPromise: Promise[Int], val actorId: ActorVirtu
         logger.warn(
           s"Failed to connect to Flight Server in this attempt, retrying after $UNIT_WAIT_TIME_MS ms... remaining attempts: ${MAX_TRY_COUNT - tryCount}"
         )
-        flightClient.close()
+        if (flightClient != null) flightClient.close()
         Thread.sleep(UNIT_WAIT_TIME_MS)
         tryCount += 1
       }
diff --git a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala
index 1c070636ebd..802b8d7d544 100644
--- a/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala
+++ b/core/workflow-operator/src/main/scala/edu/uci/ics/amber/operator/udf/python/PythonUDFOpDescV2.scala
@@ -93,7 +93,7 @@ class PythonUDFOpDescV2 extends LogicalOp {
       Map(operatorInfo.outputPorts.head.id -> outputSchema)
     }

-    if (workers > 1) {
+    val physicalOp = if (workers > 1) {
       PhysicalOp
         .oneToOnePhysicalOp(
           workflowId,
@@ -112,7 +112,10 @@ class PythonUDFOpDescV2 extends LogicalOp {
           OpExecWithCode(code, "python")
         )
         .withParallelizable(false)
-    }.withDerivePartition(_ => UnknownPartition())
+    }
+
+    physicalOp
+      .withDerivePartition(_ => UnknownPartition())
       .withInputPorts(operatorInfo.inputPorts)
       .withOutputPorts(operatorInfo.outputPorts)
       .withPartitionRequirement(partitionRequirement)

From 19644b4135a213f44a8dc454c108eff3d6c404d2 Mon Sep 17 00:00:00 2001
From: Shengquan Ni <13672781+shengquan-ni@users.noreply.github.com>
Date: Mon, 6 Jan 2025 13:26:52 -0800
Subject: [PATCH 10/10] Fix CI failures by pinning the Ubuntu version for
 backend CI (#3194)

The ubuntu-latest image has recently been updated from 22.04 to 24.04.
However, the new image is incompatible with libncurses5, requiring an
upgrade to libncurses6. Unfortunately, after upgrading, sbt no longer
functions as expected, an issue also documented in
[actions/setup-java#712](https://github.com/actions/setup-java/issues/712).
It appears that the 24.04 image does not include sbt by default.

This PR addresses the issue by pinning the image to ubuntu-22.04. We can
revisit and update the version once the 24.04 image becomes more stable
and these compatibility problems are resolved.
---
 .github/workflows/github-action-build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/github-action-build.yml b/.github/workflows/github-action-build.yml
index 351064fb4a2..4c07bd728ca 100644
--- a/.github/workflows/github-action-build.yml
+++ b/.github/workflows/github-action-build.yml
@@ -56,7 +56,7 @@ jobs:
   core:
     strategy:
       matrix:
-        os: [ ubuntu-latest ]
+        os: [ ubuntu-22.04 ]
         java-version: [ 11 ]
     runs-on: ${{ matrix.os }}
     env: