Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate dkulib & allow empty input #9

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Changelog


## [Version 1.0.2](https://github.com/dataiku/dss-plugin-nlp-amazon-translation/releases/tag/v1.0.2) - Dkulib integration - 2023-01

- ✨ Integrate dkulib and allow empty inputs

## [Version 1.0.1](https://github.com/dataiku/dss-plugin-nlp-amazon-translation/releases/tag/v1.0.1) - Move configuration parameters - 2020-08

- ✨ Move configuration parameters

## [Version 1.0.0](https://github.com/dataiku/dss-plugin-nlp-amazon-translation/releases/tag/v1.0.0) - Initial release - 2020-05

- ✨ Integration with the [Amazon Comprehend API](https://docs.aws.amazon.com/comprehend/index.html)
4 changes: 2 additions & 2 deletions code-env/python/spec/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
boto3==1.15.14
tqdm==4.50.1
tqdm==4.61.0
ratelimit==2.2.1
retry==0.9.2
more-itertools==8.5.0
more-itertools==8.8.0
37 changes: 18 additions & 19 deletions custom-recipes/amazon-comprehend-nlp-keyphrase-extraction/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@
import json
from typing import List, Dict, AnyStr, Union

from retry import retry
from ratelimit import limits, RateLimitException
from retry import retry

import dataiku
from dataiku.customrecipe import get_recipe_config, get_input_names_for_role, get_output_names_for_role

from plugin_io_utils import ErrorHandlingEnum, validate_column_input
from dku_io_utils import set_column_description
from amazon_comprehend_api_client import API_EXCEPTIONS, batch_api_response_parser, get_client
from api_parallelizer import api_parallelizer
from amazon_comprehend_api_formatting import KeyPhraseExtractionAPIFormatter

from dkulib.dku_io_utils import set_column_descriptions
from dkulib.parallelizer import DataFrameParallelizer
from plugin_io_utils import ErrorHandlingEnum, validate_column_input

# ==============================================================================
# SETUP
Expand Down Expand Up @@ -41,9 +40,9 @@
validate_column_input(text_column, input_columns_names)

batch_kwargs = {
"api_support_batch": True,
"batch_support": True,
"batch_size": batch_size,
"batch_api_response_parser": batch_api_response_parser,
"batch_response_parser": batch_api_response_parser,
}
if text_language == "language_column":
batch_kwargs = {"api_support_batch": False}
Expand All @@ -53,12 +52,10 @@
client = get_client(api_configuration_preset)
column_prefix = "keyphrase_api"


# ==============================================================================
# RUN
# ==============================================================================


@retry((RateLimitException, OSError), delay=api_quota_period, tries=5)
@limits(calls=api_quota_rate_limit, period=api_quota_period)
def call_api_key_phrase_extraction(
Expand Down Expand Up @@ -87,18 +84,20 @@ def call_api_key_phrase_extraction(
responses = client.batch_detect_key_phrases(TextList=text_list, LanguageCode=text_language)
return responses

df_parallelizer = DataFrameParallelizer(
function=call_api_key_phrase_extraction,
error_handling=error_handling,
exceptions_to_catch=API_EXCEPTIONS,
parallel_workers=parallel_workers,
output_column_prefix=column_prefix,
**batch_kwargs
)

df = api_parallelizer(
input_df=input_df,
api_call_function=call_api_key_phrase_extraction,
api_exceptions=API_EXCEPTIONS,
column_prefix=column_prefix,
df = df_parallelizer.run(
input_df,
text_column=text_column,
text_language=text_language,
language_column=language_column,
parallel_workers=parallel_workers,
error_handling=error_handling,
**batch_kwargs
)

api_formatter = KeyPhraseExtractionAPIFormatter(
Expand All @@ -107,8 +106,8 @@ def call_api_key_phrase_extraction(
output_df = api_formatter.format_df(df)

output_dataset.write_with_schema(output_df)
set_column_description(
set_column_descriptions(
input_dataset=input_dataset,
output_dataset=output_dataset,
column_description_dict=api_formatter.column_description_dict,
column_descriptions=api_formatter.column_description_dict,
)
36 changes: 18 additions & 18 deletions custom-recipes/amazon-comprehend-nlp-language-detection/recipe.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
# -*- coding: utf-8 -*-
from typing import List, Dict, AnyStr

from retry import retry
from ratelimit import limits, RateLimitException
from retry import retry

import dataiku
from dataiku.customrecipe import get_recipe_config, get_input_names_for_role, get_output_names_for_role

from plugin_io_utils import ErrorHandlingEnum, validate_column_input
from dku_io_utils import set_column_description
from amazon_comprehend_api_client import API_EXCEPTIONS, batch_api_response_parser, get_client
from api_parallelizer import api_parallelizer
from amazon_comprehend_api_formatting import LanguageDetectionAPIFormatter

from dkulib.dku_io_utils import set_column_descriptions
from dkulib.parallelizer import DataFrameParallelizer
from plugin_io_utils import ErrorHandlingEnum, validate_column_input

# ==============================================================================
# SETUP
Expand All @@ -39,17 +38,15 @@
client = get_client(api_configuration_preset)
column_prefix = "lang_detect_api"
batch_kwargs = {
"api_support_batch": True,
"batch_support": True,
"batch_size": batch_size,
"batch_api_response_parser": batch_api_response_parser,
"batch_response_parser": batch_api_response_parser,
}


# ==============================================================================
# RUN
# ==============================================================================


@retry((RateLimitException, OSError), delay=api_quota_period, tries=5)
@limits(calls=api_quota_rate_limit, period=api_quota_period)
def call_api_language_detection(batch: List[Dict], text_column: AnyStr) -> List[Dict]:
Expand All @@ -58,25 +55,28 @@ def call_api_language_detection(batch: List[Dict], text_column: AnyStr) -> List[
return responses


df = api_parallelizer(
input_df=input_df,
api_call_function=call_api_language_detection,
api_exceptions=API_EXCEPTIONS,
column_prefix=column_prefix,
text_column=text_column,
parallel_workers=parallel_workers,
df_parallelizer = DataFrameParallelizer(
function=call_api_language_detection,
error_handling=error_handling,
exceptions_to_catch=API_EXCEPTIONS,
parallel_workers=parallel_workers,
output_column_prefix=column_prefix,
**batch_kwargs
)

df = df_parallelizer.run(
input_df,
text_column=text_column,
)

api_formatter = LanguageDetectionAPIFormatter(
input_df=input_df, column_prefix=column_prefix, error_handling=error_handling,
)
output_df = api_formatter.format_df(df)

output_dataset.write_with_schema(output_df)
set_column_description(
set_column_descriptions(
input_dataset=input_dataset,
output_dataset=output_dataset,
column_description_dict=api_formatter.column_description_dict,
column_descriptions=api_formatter.column_description_dict,
)
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@
import json
from typing import List, Dict, AnyStr, Union

from retry import retry
from ratelimit import limits, RateLimitException
from retry import retry

import dataiku
from dataiku.customrecipe import get_recipe_config, get_input_names_for_role, get_output_names_for_role

from plugin_io_utils import ErrorHandlingEnum, validate_column_input
from dku_io_utils import set_column_description
from amazon_comprehend_api_client import API_EXCEPTIONS, batch_api_response_parser, get_client
from api_parallelizer import api_parallelizer
from amazon_comprehend_api_formatting import EntityTypeEnum, NamedEntityRecognitionAPIFormatter

from dkulib.dku_io_utils import set_column_descriptions
from dkulib.parallelizer import DataFrameParallelizer
from plugin_io_utils import ErrorHandlingEnum, validate_column_input

# ==============================================================================
# SETUP
Expand Down Expand Up @@ -44,9 +43,9 @@
validate_column_input(text_column, input_columns_names)

batch_kwargs = {
"api_support_batch": True,
"batch_support": True,
"batch_size": batch_size,
"batch_api_response_parser": batch_api_response_parser,
"batch_response_parser": batch_api_response_parser,
}
if text_language == "language_column":
batch_kwargs = {"api_support_batch": False}
Expand All @@ -56,12 +55,10 @@
client = get_client(api_configuration_preset)
column_prefix = "entity_api"


# ==============================================================================
# RUN
# ==============================================================================


@retry((RateLimitException, OSError), delay=api_quota_period, tries=5)
@limits(calls=api_quota_rate_limit, period=api_quota_period)
def call_api_named_entity_recognition(
Expand Down Expand Up @@ -91,17 +88,20 @@ def call_api_named_entity_recognition(
return responses


df = api_parallelizer(
input_df=input_df,
api_call_function=call_api_named_entity_recognition,
api_exceptions=API_EXCEPTIONS,
column_prefix=column_prefix,
df_parallelizer = DataFrameParallelizer(
function=call_api_named_entity_recognition,
error_handling=error_handling,
exceptions_to_catch=API_EXCEPTIONS,
parallel_workers=parallel_workers,
output_column_prefix=column_prefix,
**batch_kwargs
)

df = df_parallelizer.run(
input_df,
text_column=text_column,
text_language=text_language,
language_column=language_column,
parallel_workers=parallel_workers,
error_handling=error_handling,
**batch_kwargs
)

api_formatter = NamedEntityRecognitionAPIFormatter(
Expand All @@ -114,8 +114,8 @@ def call_api_named_entity_recognition(
output_df = api_formatter.format_df(df)

output_dataset.write_with_schema(output_df)
set_column_description(
set_column_descriptions(
input_dataset=input_dataset,
output_dataset=output_dataset,
column_description_dict=api_formatter.column_description_dict,
column_descriptions=api_formatter.column_description_dict,
)
38 changes: 19 additions & 19 deletions custom-recipes/amazon-comprehend-nlp-sentiment-analysis/recipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,17 @@
import json
from typing import List, Dict, AnyStr, Union

from retry import retry
from ratelimit import limits, RateLimitException
from retry import retry

import dataiku
from dataiku.customrecipe import get_recipe_config, get_input_names_for_role, get_output_names_for_role

from plugin_io_utils import ErrorHandlingEnum, validate_column_input
from dku_io_utils import set_column_description
from amazon_comprehend_api_client import API_EXCEPTIONS, batch_api_response_parser, get_client
from api_parallelizer import api_parallelizer
from amazon_comprehend_api_formatting import SentimentAnalysisAPIFormatter

from dkulib.dku_io_utils import set_column_descriptions
from dkulib.parallelizer import DataFrameParallelizer
from plugin_io_utils import ErrorHandlingEnum, validate_column_input

# ==============================================================================
# SETUP
Expand All @@ -40,9 +39,9 @@
validate_column_input(text_column, input_columns_names)

batch_kwargs = {
"api_support_batch": True,
"batch_support": True,
"batch_size": batch_size,
"batch_api_response_parser": batch_api_response_parser,
"batch_response_parser": batch_api_response_parser,
}
if text_language == "language_column":
batch_kwargs = {"api_support_batch": False}
Expand All @@ -52,12 +51,10 @@
client = get_client(api_configuration_preset)
column_prefix = "sentiment_api"


# ==============================================================================
# RUN
# ==============================================================================


@retry((RateLimitException, OSError), delay=api_quota_period, tries=5)
@limits(calls=api_quota_rate_limit, period=api_quota_period)
def call_api_sentiment_analysis(
Expand Down Expand Up @@ -87,17 +84,20 @@ def call_api_sentiment_analysis(
return responses


df = api_parallelizer(
input_df=input_df,
api_call_function=call_api_sentiment_analysis,
api_exceptions=API_EXCEPTIONS,
column_prefix=column_prefix,
df_parallelizer = DataFrameParallelizer(
function=call_api_sentiment_analysis,
error_handling=error_handling,
exceptions_to_catch=API_EXCEPTIONS,
parallel_workers=parallel_workers,
output_column_prefix=column_prefix,
**batch_kwargs
)

df = df_parallelizer.run(
input_df,
text_column=text_column,
text_language=text_language,
language_column=language_column,
parallel_workers=parallel_workers,
error_handling=error_handling,
**batch_kwargs
)

api_formatter = SentimentAnalysisAPIFormatter(
Expand All @@ -106,8 +106,8 @@ def call_api_sentiment_analysis(
output_df = api_formatter.format_df(df)

output_dataset.write_with_schema(output_df)
set_column_description(
set_column_descriptions(
input_dataset=input_dataset,
output_dataset=output_dataset,
column_description_dict=api_formatter.column_description_dict,
column_descriptions=api_formatter.column_description_dict,
)
4 changes: 2 additions & 2 deletions plugin.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"id": "amazon-comprehend-nlp",
"version": "1.0.1",
"version": "1.0.2",
"meta": {
"label": "Amazon Comprehend NLP",
"category": "Natural Language Processing",
Expand All @@ -16,4 +16,4 @@
],
"supportLevel": "TIER2_SUPPORT"
}
}
}
Loading